Use the new downsampling implementation by default
-Switch back to the old implementation, if needed, with --use_legacy_downsampler -LocusIteratorByStateExperimental becomes the new LocusIteratorByState, and the original LocusIteratorByState becomes LegacyLocusIteratorByState -Similarly, the ExperimentalReadShardBalancer becomes the new ReadShardBalancer, with the old one renamed to LegacyReadShardBalancer -Performance improvements: locus traversals used to be 20% slower in the new downsampling implementation, now they are roughly the same speed. -Tests show a very high level of concordance with UG calls from the previous implementation, with some new calls and edge cases that still require more examination. -With the new implementation, can now use -dcov with ReadWalkers to set a limit on the max # of reads per alignment start position per sample. Appropriate value for ReadWalker dcov may be in the single digits for some tools, but this too requires more investigation.
This commit is contained in:
parent
5460c96137
commit
46edab6d6a
|
|
@ -80,11 +80,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c");
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288");
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("d20c7a143b899f0239bf64b652ad3edb"));
|
||||
Arrays.asList("97df6c2a8d390d43b9bdf56c979d9b09"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("fb204e821a24d03bd3a671b6e01c449a"));
|
||||
Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
|
||||
|
|
@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testNsInCigar() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
|
||||
Arrays.asList("32f18ba50406cd8c8069ba07f2f89558"));
|
||||
Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6"));
|
||||
executeTest("test calling on reads with Ns in CIGAR", spec);
|
||||
}
|
||||
|
||||
|
|
@ -451,13 +451,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("c1077662411164182c5f75478344f83d"));
|
||||
Arrays.asList("092e42a712afb660ec79ff11c55933e2"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "dee6590e3b7079890bc3a9cb372c297e");
|
||||
testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -445,13 +445,17 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
GATKArgumentCollection argCollection = this.getArguments();
|
||||
boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
|
||||
|
||||
// Legacy downsampler can only be selected via the command line, not via walker annotations
|
||||
boolean useLegacyDownsampler = argCollection.useLegacyDownsampler;
|
||||
|
||||
DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
|
||||
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useLegacyDownsampler);
|
||||
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useLegacyDownsampler);
|
||||
|
||||
return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
|
||||
DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
|
||||
method.checkCompatibilityWithWalker(walker);
|
||||
return method;
|
||||
}
|
||||
|
||||
protected void setDownsamplingMethod(DownsamplingMethod method) {
|
||||
|
|
@ -580,9 +584,9 @@ public class GenomeAnalysisEngine {
|
|||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
}
|
||||
|
||||
// Use the experimental ReadShardBalancer if experimental downsampling is enabled
|
||||
ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ?
|
||||
new ExperimentalReadShardBalancer() :
|
||||
// Use the legacy ReadShardBalancer if legacy downsampling is enabled
|
||||
ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ?
|
||||
new LegacyReadShardBalancer() :
|
||||
new ReadShardBalancer();
|
||||
|
||||
if(intervals == null)
|
||||
|
|
|
|||
|
|
@ -305,11 +305,23 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
* Gets the type of downsampling method requested by the walker. If an alternative
|
||||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walkerClass The class of the walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @param walker The walker to interrogate.
|
||||
* @param useLegacyDownsampler If true, use the legacy downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useLegacyDownsampler) {
|
||||
return getDownsamplingMethod(walker.getClass(), useLegacyDownsampler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the type of downsampling method requested by the walker. If an alternative
|
||||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walkerClass The class of the walker to interrogate.
|
||||
* @param useLegacyDownsampler If true, use the legacy downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useLegacyDownsampler) {
|
||||
DownsamplingMethod downsamplingMethod = null;
|
||||
|
||||
if( walkerClass.isAnnotationPresent(Downsample.class) ) {
|
||||
|
|
@ -317,7 +329,7 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
DownsampleType type = downsampleParameters.by();
|
||||
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
|
||||
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useLegacyDownsampler);
|
||||
}
|
||||
|
||||
return downsamplingMethod;
|
||||
|
|
@ -331,18 +343,6 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the type of downsampling method requested by the walker. If an alternative
|
||||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walker The walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
|
||||
return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a name for this type of walker.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -162,12 +162,11 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
|
||||
public Double downsampleFraction = null;
|
||||
|
||||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
|
||||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false)
|
||||
public Integer downsampleCoverage = null;
|
||||
|
||||
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
|
||||
@Hidden
|
||||
public boolean enableExperimentalDownsampling = false;
|
||||
@Argument(fullName = "use_legacy_downsampler", shortName = "use_legacy_downsampler", doc = "Use the legacy downsampling implementation instead of the newer, less-tested implementation", required = false)
|
||||
public boolean useLegacyDownsampler = false;
|
||||
|
||||
/**
|
||||
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
|
||||
|
|
@ -178,7 +177,7 @@ public class GATKArgumentCollection {
|
|||
if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
|
||||
return null;
|
||||
|
||||
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
|
||||
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, useLegacyDownsampler);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -192,7 +191,7 @@ public class GATKArgumentCollection {
|
|||
downsamplingType = method.type;
|
||||
downsampleCoverage = method.toCoverage;
|
||||
downsampleFraction = method.toFraction;
|
||||
enableExperimentalDownsampling = method.useExperimentalDownsampling;
|
||||
useLegacyDownsampler = method.useLegacyDownsampler;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -136,11 +136,12 @@ public abstract class LocusView extends LocusIterator implements View {
|
|||
// Cache the current and apply filtering.
|
||||
AlignmentContext current = nextLocus;
|
||||
|
||||
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
|
||||
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
|
||||
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
|
||||
// The old ALL_READS downsampling implementation -- use only if legacy downsampling was requested:
|
||||
if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler &&
|
||||
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS &&
|
||||
sourceInfo.getDownsamplingMethod().toCoverage != null ) {
|
||||
|
||||
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
|
||||
current.downsampleToCoverage(sourceInfo.getDownsamplingMethod().toCoverage);
|
||||
}
|
||||
|
||||
// Indicate that the next operation will need to advance.
|
||||
|
|
|
|||
|
|
@ -134,12 +134,11 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
|
||||
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
|
||||
// TODO: clean this up once the experimental downsampling engine fork collapses
|
||||
if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) {
|
||||
currentPosition = dataSource.getInitialReaderPositions();
|
||||
if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useLegacyDownsampler ) {
|
||||
currentPosition = dataSource.getCurrentPosition();
|
||||
}
|
||||
else {
|
||||
currentPosition = dataSource.getCurrentPosition();
|
||||
|
||||
currentPosition = dataSource.getInitialReaderPositions();
|
||||
}
|
||||
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
|
|
|
|||
|
|
@ -1,228 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards.
|
||||
*
|
||||
* When processing FilePointers, our strategy is to aggregate all FilePointers for each contig
|
||||
* together into one monolithic FilePointer, create one persistent set of read iterators over
|
||||
* that monolithic FilePointer, and repeatedly use that persistent set of read iterators to
|
||||
* fill read shards with reads.
|
||||
*
|
||||
* This strategy has several important advantages:
|
||||
*
|
||||
* 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole
|
||||
* contig will have regions that overlap with other FilePointers on the same contig, due
|
||||
* to the limited granularity of BAM index data. By creating only one FilePointer per contig,
|
||||
* we avoid having to track how much of each file region we've visited (as we did in the
|
||||
* former implementation), we avoid expensive non-sequential access patterns in the files,
|
||||
* and we avoid having to repeatedly re-create our iterator chain for every small region
|
||||
* of interest.
|
||||
*
|
||||
* 2. We avoid boundary issues with the engine-level downsampling. Since we create a single
|
||||
* persistent set of read iterators (which include the downsampling iterator(s)) per contig,
|
||||
* the downsampling process is never interrupted by FilePointer or Shard boundaries, and never
|
||||
* loses crucial state information while downsampling within a contig.
|
||||
*
|
||||
* TODO: There is also at least one important disadvantage:
|
||||
*
|
||||
* 1. We load more BAM index data into memory at once, and this work is done upfront before processing
|
||||
* the next contig, creating a delay before traversal of each contig. This delay may be
|
||||
* compensated for by the gains listed in #1 above, and we may be no worse off overall in
|
||||
* terms of total runtime, but we need to verify this empirically.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ExperimentalReadShardBalancer extends ShardBalancer {
|
||||
|
||||
private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class);
|
||||
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentContigFilePointer = null;
|
||||
|
||||
/**
|
||||
* Iterator over the reads from the current contig's file pointer. The same iterator will be
|
||||
* used to fill all shards associated with a given file pointer
|
||||
*/
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
if ( ! hasNext() )
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
nextShard = null;
|
||||
|
||||
// May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away
|
||||
while ( nextShard == null && currentContigFilePointer != null ) {
|
||||
|
||||
// If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one):
|
||||
if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) {
|
||||
|
||||
// Close the old, exhausted chain of iterators to release resources
|
||||
currentContigReadsIterator.close();
|
||||
|
||||
// Advance to the FilePointer for the next contig
|
||||
createNextContigFilePointer();
|
||||
|
||||
// We'll need to create a fresh iterator for this file pointer when we create the first
|
||||
// shard for it below.
|
||||
currentContigReadsIterator = null;
|
||||
}
|
||||
|
||||
// At this point our currentContigReadsIterator may be null or non-null depending on whether or not
|
||||
// this is our first shard for this file pointer.
|
||||
if ( currentContigFilePointer != null ) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped);
|
||||
|
||||
// Create a new reads iterator only when we've just advanced to the file pointer for the next
|
||||
// contig. It's essential that the iterators persist across all shards that share the same contig
|
||||
// to allow the downsampling to work properly.
|
||||
if ( currentContigReadsIterator == null ) {
|
||||
currentContigReadsIterator = new PeekableIterator<SAMRecord>(readsDataSource.getIterator(shard));
|
||||
}
|
||||
|
||||
if ( currentContigReadsIterator.hasNext() ) {
|
||||
shard.fill(currentContigReadsIterator);
|
||||
nextShard = shard;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate all FilePointers for the next contig together into one monolithic FilePointer
|
||||
* to avoid boundary issues with visiting the same file regions more than once (since more
|
||||
* granular FilePointers will have regions that overlap with other nearby FilePointers due
|
||||
* to the nature of BAM indices).
|
||||
*
|
||||
* By creating one persistent set of iterators per contig we also avoid boundary artifacts
|
||||
* in the engine-level downsampling.
|
||||
*
|
||||
* TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for
|
||||
* TODO: read traversals, as there's little point in the BAMSchedule emitting extremely
|
||||
* TODO: granular FilePointers if we're just going to union them. The BAMSchedule should
|
||||
* TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for
|
||||
* TODO: locus traversals).
|
||||
*/
|
||||
private void createNextContigFilePointer() {
|
||||
currentContigFilePointer = null;
|
||||
List<FilePointer> nextContigFilePointers = new ArrayList<FilePointer>();
|
||||
|
||||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
(! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped &&
|
||||
nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) ||
|
||||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
// save it for next time
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! nextContigFilePointers.isEmpty() ) {
|
||||
currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser);
|
||||
}
|
||||
|
||||
if ( currentContigFilePointer != null ) {
|
||||
logger.info("Done loading BAM index data for next contig");
|
||||
logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer));
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,129 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||
*
|
||||
* TODO: delete this class once the experimental downsampling engine fork collapses
|
||||
*/
|
||||
public class LegacyReadShardBalancer extends ShardBalancer {
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
*/
|
||||
private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();
|
||||
|
||||
{
|
||||
if(filePointers.hasNext())
|
||||
currentFilePointer = filePointers.next();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition;
|
||||
nextShard = null;
|
||||
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(!isEmpty(selectedReaders)) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
readsDataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
|
||||
}
|
||||
|
||||
position = readsDataSource.getCurrentPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects whether the list of file spans contain any read data.
|
||||
* @param selectedSpans Mapping of readers to file spans.
|
||||
* @return True if file spans are completely empty; false otherwise.
|
||||
*/
|
||||
private boolean isEmpty(Map<SAMReaderID,SAMFileSpan> selectedSpans) {
|
||||
for(SAMFileSpan fileSpan: selectedSpans.values()) {
|
||||
if(!fileSpan.isEmpty())
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -24,20 +24,49 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||
* Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards.
|
||||
*
|
||||
* TODO: delete this class once the experimental downsampling engine fork collapses
|
||||
* When processing FilePointers, our strategy is to aggregate all FilePointers for each contig
|
||||
* together into one monolithic FilePointer, create one persistent set of read iterators over
|
||||
* that monolithic FilePointer, and repeatedly use that persistent set of read iterators to
|
||||
* fill read shards with reads.
|
||||
*
|
||||
* This strategy has several important advantages:
|
||||
*
|
||||
* 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole
|
||||
* contig will have regions that overlap with other FilePointers on the same contig, due
|
||||
* to the limited granularity of BAM index data. By creating only one FilePointer per contig,
|
||||
* we avoid having to track how much of each file region we've visited (as we did in the
|
||||
* former implementation), we avoid expensive non-sequential access patterns in the files,
|
||||
* and we avoid having to repeatedly re-create our iterator chain for every small region
|
||||
* of interest.
|
||||
*
|
||||
* 2. We avoid boundary issues with the engine-level downsampling. Since we create a single
|
||||
* persistent set of read iterators (which include the downsampling iterator(s)) per contig,
|
||||
* the downsampling process is never interrupted by FilePointer or Shard boundaries, and never
|
||||
* loses crucial state information while downsampling within a contig.
|
||||
*
|
||||
* TODO: There is also at least one important disadvantage:
|
||||
*
|
||||
* 1. We load more BAM index data into memory at once, and this work is done upfront before processing
|
||||
* the next contig, creating a delay before traversal of each contig. This delay may be
|
||||
* compensated for by the gains listed in #1 above, and we may be no worse off overall in
|
||||
* terms of total runtime, but we need to verify this empirically.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ReadShardBalancer extends ShardBalancer {
|
||||
|
||||
private static Logger logger = Logger.getLogger(ReadShardBalancer.class);
|
||||
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
|
|
@ -52,16 +81,27 @@ public class ReadShardBalancer extends ShardBalancer {
|
|||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
private FilePointer currentContigFilePointer = null;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
* Iterator over the reads from the current contig's file pointer. The same iterator will be
|
||||
* used to fill all shards associated with a given file pointer
|
||||
*/
|
||||
private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
if(filePointers.hasNext())
|
||||
currentFilePointer = filePointers.next();
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
}
|
||||
|
||||
|
|
@ -70,58 +110,117 @@ public class ReadShardBalancer extends ShardBalancer {
|
|||
}
|
||||
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
if ( ! hasNext() )
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition;
|
||||
nextShard = null;
|
||||
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
// May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away
|
||||
while ( nextShard == null && currentContigFilePointer != null ) {
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
|
||||
selectedReaders.put(id,fileSpan);
|
||||
// If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one):
|
||||
if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) {
|
||||
|
||||
// Close the old, exhausted chain of iterators to release resources
|
||||
currentContigReadsIterator.close();
|
||||
|
||||
// Advance to the FilePointer for the next contig
|
||||
createNextContigFilePointer();
|
||||
|
||||
// We'll need to create a fresh iterator for this file pointer when we create the first
|
||||
// shard for it below.
|
||||
currentContigReadsIterator = null;
|
||||
}
|
||||
|
||||
if(!isEmpty(selectedReaders)) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
readsDataSource.fillShard(shard);
|
||||
// At this point our currentContigReadsIterator may be null or non-null depending on whether or not
|
||||
// this is our first shard for this file pointer.
|
||||
if ( currentContigFilePointer != null ) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
// Create a new reads iterator only when we've just advanced to the file pointer for the next
|
||||
// contig. It's essential that the iterators persist across all shards that share the same contig
|
||||
// to allow the downsampling to work properly.
|
||||
if ( currentContigReadsIterator == null ) {
|
||||
currentContigReadsIterator = new PeekableIterator<SAMRecord>(readsDataSource.getIterator(shard));
|
||||
}
|
||||
|
||||
if ( currentContigReadsIterator.hasNext() ) {
|
||||
shard.fill(currentContigReadsIterator);
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
|
||||
}
|
||||
|
||||
position = readsDataSource.getCurrentPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects whether the list of file spans contain any read data.
|
||||
* @param selectedSpans Mapping of readers to file spans.
|
||||
* @return True if file spans are completely empty; false otherwise.
|
||||
* Aggregate all FilePointers for the next contig together into one monolithic FilePointer
|
||||
* to avoid boundary issues with visiting the same file regions more than once (since more
|
||||
* granular FilePointers will have regions that overlap with other nearby FilePointers due
|
||||
* to the nature of BAM indices).
|
||||
*
|
||||
* By creating one persistent set of iterators per contig we also avoid boundary artifacts
|
||||
* in the engine-level downsampling.
|
||||
*
|
||||
* TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for
|
||||
* TODO: read traversals, as there's little point in the BAMSchedule emitting extremely
|
||||
* TODO: granular FilePointers if we're just going to union them. The BAMSchedule should
|
||||
* TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for
|
||||
* TODO: locus traversals).
|
||||
*/
|
||||
private boolean isEmpty(Map<SAMReaderID,SAMFileSpan> selectedSpans) {
|
||||
for(SAMFileSpan fileSpan: selectedSpans.values()) {
|
||||
if(!fileSpan.isEmpty())
|
||||
return false;
|
||||
private void createNextContigFilePointer() {
|
||||
currentContigFilePointer = null;
|
||||
List<FilePointer> nextContigFilePointers = new ArrayList<FilePointer>();
|
||||
|
||||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
(! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped &&
|
||||
nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) ||
|
||||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
// save it for next time
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
if ( ! nextContigFilePointers.isEmpty() ) {
|
||||
currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser);
|
||||
}
|
||||
|
||||
if ( currentContigFilePointer != null ) {
|
||||
logger.info("Done loading BAM index data for next contig");
|
||||
logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer));
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -466,7 +466,7 @@ public class SAMDataSource {
|
|||
/**
|
||||
* Legacy method to fill the given buffering shard with reads.
|
||||
*
|
||||
* Shard.fill() is used instead of this method when experimental downsampling is enabled
|
||||
* Shard.fill() is used instead of this method unless legacy downsampling is enabled
|
||||
*
|
||||
* TODO: delete this method once the experimental downsampling engine fork collapses
|
||||
*
|
||||
|
|
@ -638,7 +638,8 @@ public class SAMDataSource {
|
|||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||
readProperties.getSupplementalFilters(),
|
||||
readProperties.getReadTransformers(),
|
||||
readProperties.defaultBaseQualities());
|
||||
readProperties.defaultBaseQualities(),
|
||||
shard instanceof LocusShard);
|
||||
}
|
||||
|
||||
private class BAMCodecIterator implements CloseableIterator<SAMRecord> {
|
||||
|
|
@ -695,6 +696,7 @@ public class SAMDataSource {
|
|||
* @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this.
|
||||
* @param supplementalFilters additional filters to apply to the reads.
|
||||
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
|
||||
* @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard
|
||||
* @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
|
||||
*/
|
||||
protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics,
|
||||
|
|
@ -705,7 +707,8 @@ public class SAMDataSource {
|
|||
Boolean noValidationOfReadOrder,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
List<ReadTransformer> readTransformers,
|
||||
byte defaultBaseQualities) {
|
||||
byte defaultBaseQualities,
|
||||
boolean isLocusBasedTraversal ) {
|
||||
|
||||
// ************************************************************************************************ //
|
||||
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
|
|
@ -714,12 +717,26 @@ public class SAMDataSource {
|
|||
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
|
||||
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
|
||||
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
|
||||
// If we're using the new downsampling implementation, apply downsampling iterators at this
|
||||
// point in the read stream for most (but not all) cases
|
||||
if ( ! readProperties.getDownsamplingMethod().useLegacyDownsampler ) {
|
||||
|
||||
// For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers
|
||||
// will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding
|
||||
// splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling
|
||||
// of individual reads.
|
||||
boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal &&
|
||||
readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
|
||||
readProperties.getDownsamplingMethod().toCoverage != null;
|
||||
|
||||
if ( ! assumeDownstreamLIBSDownsampling ) {
|
||||
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
|
||||
}
|
||||
}
|
||||
|
||||
// Use the old fractional downsampler only if we're not using experimental downsampling:
|
||||
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
|
||||
// Use the old fractional downsampler only if we're using legacy downsampling:
|
||||
// TODO: remove this statement (and associated classes) once the downsampling engine fork collapses
|
||||
if ( readProperties.getDownsamplingMethod().useLegacyDownsampler && downsamplingFraction != null )
|
||||
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
|
||||
|
|
@ -741,19 +758,37 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
|
||||
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
|
||||
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
|
||||
if ( readProperties.getDownsamplingMethod() == null ||
|
||||
readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) {
|
||||
return wrappedIterator;
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
|
||||
ReadsDownsampler<SAMRecord> downsampler = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new DownsamplingReadsIterator(wrappedIterator, downsampler);
|
||||
if ( readProperties.getDownsamplingMethod().toFraction != null ) {
|
||||
|
||||
// If we're downsampling to a fraction of reads, there's no point in paying the cost of
|
||||
// splitting/re-assembling the read stream by sample to run the FractionalDownsampler on
|
||||
// reads from each sample separately, since the result would be the same as running the
|
||||
// FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator
|
||||
// rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling
|
||||
// was requested.
|
||||
|
||||
return new DownsamplingReadsIterator(wrappedIterator,
|
||||
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction));
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().toCoverage != null ) {
|
||||
|
||||
// If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling
|
||||
// the read stream to run the downsampler on the reads for each individual sample separately if
|
||||
// BY_SAMPLE downsampling was requested.
|
||||
|
||||
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
|
||||
return new PerSampleDownsamplingReadsIterator(wrappedIterator,
|
||||
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage));
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
|
||||
return new DownsamplingReadsIterator(wrappedIterator,
|
||||
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage));
|
||||
}
|
||||
}
|
||||
|
||||
return wrappedIterator;
|
||||
|
|
|
|||
|
|
@ -50,9 +50,9 @@ public class DownsamplingMethod {
|
|||
public final Double toFraction;
|
||||
|
||||
/**
|
||||
* Use the new experimental downsampling?
|
||||
* Use the legacy downsampling implementation instead of the newer implementation?
|
||||
*/
|
||||
public final boolean useExperimentalDownsampling;
|
||||
public final boolean useLegacyDownsampler;
|
||||
|
||||
/**
|
||||
* Expresses no downsampling applied at all.
|
||||
|
|
@ -69,11 +69,11 @@ public class DownsamplingMethod {
|
|||
*/
|
||||
public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;
|
||||
|
||||
public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
|
||||
public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useLegacyDownsampler ) {
|
||||
this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
|
||||
this.toCoverage = toCoverage;
|
||||
this.toFraction = toFraction;
|
||||
this.useExperimentalDownsampling = useExperimentalDownsampling;
|
||||
this.useLegacyDownsampler = useLegacyDownsampler;
|
||||
|
||||
if ( type == DownsampleType.NONE ) {
|
||||
toCoverage = null;
|
||||
|
|
@ -101,19 +101,19 @@ public class DownsamplingMethod {
|
|||
if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
|
||||
throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
|
||||
}
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the old downsampling implementation:
|
||||
if ( ! useExperimentalDownsampling ) {
|
||||
// By sample downsampling does not work with a fraction of reads in the old downsampling implementation
|
||||
if( type == DownsampleType.BY_SAMPLE && toFraction != null )
|
||||
throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
|
||||
public void checkCompatibilityWithWalker( Walker walker ) {
|
||||
boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker;
|
||||
|
||||
if ( ! isLocusTraversal && useLegacyDownsampler && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Downsampling to coverage for read-based traversals (eg., ReadWalkers) is not supported in the legacy downsampling implementation. " +
|
||||
"The newer downsampling implementation does not have this limitation.");
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the new downsampling implementation:
|
||||
if ( useExperimentalDownsampling ) {
|
||||
if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
|
||||
}
|
||||
if ( isLocusTraversal && ! useLegacyDownsampler && type == DownsampleType.ALL_READS && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not yet supported in the new downsampling implementation (though it is supported for ReadWalkers). " +
|
||||
"You can run with --use_legacy_downsampler for a broken and poorly-maintained implementation of ALL_READS to-coverage downsampling, but this is not recommended.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -124,30 +124,34 @@ public class DownsamplingMethod {
|
|||
builder.append("No downsampling");
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Method: %s ", type));
|
||||
builder.append(String.format("Method: %s, ", type));
|
||||
|
||||
if ( toCoverage != null ) {
|
||||
builder.append(String.format("Target Coverage: %d ", toCoverage));
|
||||
builder.append(String.format("Target Coverage: %d, ", toCoverage));
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Target Fraction: %.2f ", toFraction));
|
||||
builder.append(String.format("Target Fraction: %.2f, ", toFraction));
|
||||
}
|
||||
|
||||
if ( useExperimentalDownsampling ) {
|
||||
builder.append("Using Experimental Downsampling");
|
||||
if ( useLegacyDownsampler ) {
|
||||
builder.append("Using the legacy downsampling implementation");
|
||||
}
|
||||
else {
|
||||
builder.append("Using the new downsampling implementation");
|
||||
}
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useLegacyDownsampler ) {
|
||||
if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
|
||||
null, useExperimentalDownsampling);
|
||||
null, useLegacyDownsampler);
|
||||
}
|
||||
else {
|
||||
return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
|
||||
// Downsampling is off by default for non-locus-based traversals
|
||||
return new DownsamplingMethod(DownsampleType.NONE, null, null, useLegacyDownsampler);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no
|
||||
* downsampling whatsoever, and instead simply "passes-through" all the reads it's given.
|
||||
* Useful for situations where you want to disable downsampling, but still need to use
|
||||
* the downsampler interface.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class PassThroughDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private ArrayList<T> selectedReads;
|
||||
|
||||
public PassThroughDownsampler() {
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
// All reads pass-through, no reads get downsampled
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasFinalizedItems() {
|
||||
return selectedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return selectedReads.isEmpty() ? null : selectedReads.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
selectedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
}
|
||||
|
|
@ -4,9 +4,9 @@ import net.sf.picard.util.PeekableIterator;
|
|||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -83,17 +83,18 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
|
|||
this.sourceInfo = shard.getReadProperties();
|
||||
this.readIterator = iterator;
|
||||
|
||||
// Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested:
|
||||
this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ?
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames))
|
||||
// Use the legacy version of LocusIteratorByState if legacy downsampling was requested:
|
||||
this.sourceIterator = sourceInfo.getDownsamplingMethod().useLegacyDownsampler ?
|
||||
new PeekableIterator<AlignmentContext>(new LegacyLocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames))
|
||||
:
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames));
|
||||
|
||||
|
||||
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;
|
||||
}
|
||||
|
||||
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals ) {
|
||||
this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
|
||||
this(shard, genomeLocParser, iterator, intervals, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups());
|
||||
}
|
||||
|
||||
public Iterator<WindowMakerIterator> iterator() {
|
||||
|
|
|
|||
|
|
@ -31,14 +31,14 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
|
||||
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.LegacyReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -50,11 +50,11 @@ import java.util.*;
|
|||
/**
|
||||
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
|
||||
*/
|
||||
public class LocusIteratorByStateExperimental extends LocusIterator {
|
||||
public class LegacyLocusIteratorByState extends LocusIterator {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
|
||||
private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class);
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -69,7 +69,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
private final ArrayList<String> samples;
|
||||
private final ReadStateManager readStates;
|
||||
|
||||
protected static class SAMRecordState {
|
||||
static private class SAMRecordState {
|
||||
SAMRecord read;
|
||||
int readOffset = -1; // how far are we offset from the start of the read bases?
|
||||
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
|
||||
|
|
@ -213,7 +213,6 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
//final boolean DEBUG2 = false && DEBUG;
|
||||
private ReadProperties readInfo;
|
||||
private AlignmentContext nextAlignmentContext;
|
||||
private boolean performLevelingDownsampling;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -221,15 +220,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public LocusIteratorByStateExperimental(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
|
||||
public LegacyLocusIteratorByState(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
|
||||
this.readInfo = readInformation;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.samples = new ArrayList<String>(samples);
|
||||
this.readStates = new ReadStateManager(samIterator);
|
||||
|
||||
this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null &&
|
||||
readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
|
||||
readInfo.getDownsamplingMethod().toCoverage != null;
|
||||
this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());
|
||||
|
||||
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
|
||||
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
|
||||
|
|
@ -290,13 +285,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
|
||||
final GenomeLoc location = getLocation();
|
||||
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
|
||||
// TODO: How can you determine here whether the current pileup has been downsampled?
|
||||
boolean hasBeenSampled = false;
|
||||
|
||||
for (final String sample : samples) {
|
||||
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
int size = 0; // number of elements in this sample's pileup
|
||||
int nDeletions = 0; // number of deletions in this sample's pileup
|
||||
|
|
@ -405,20 +398,34 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
protected class ReadStateManager {
|
||||
private class ReadStateManager {
|
||||
private final PeekableIterator<SAMRecord> iterator;
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final SamplePartitioner samplePartitioner;
|
||||
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
|
||||
private final int targetCoverage;
|
||||
private int totalReadStates = 0;
|
||||
|
||||
public ReadStateManager(Iterator<SAMRecord> source) {
|
||||
public ReadStateManager(Iterator<SAMRecord> source, DownsamplingMethod downsamplingMethod) {
|
||||
this.iterator = new PeekableIterator<SAMRecord>(source);
|
||||
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE;
|
||||
switch (this.downsamplingMethod.type) {
|
||||
case BY_SAMPLE:
|
||||
if (downsamplingMethod.toCoverage == null)
|
||||
throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample");
|
||||
this.targetCoverage = downsamplingMethod.toCoverage;
|
||||
break;
|
||||
default:
|
||||
this.targetCoverage = Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner();
|
||||
Map<String, ReadSelector> readSelectors = new HashMap<String, ReadSelector>();
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector());
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner(readSelectors);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -442,6 +449,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
|
||||
public void remove() {
|
||||
wrappedIterator.remove();
|
||||
totalReadStates--;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
@ -469,6 +477,17 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
return readStatesBySample.get(sample).size();
|
||||
}
|
||||
|
||||
/**
|
||||
* The extent of downsampling; basically, the furthest base out which has 'fallen
|
||||
* victim' to the downsampler.
|
||||
*
|
||||
* @param sample Sample, downsampled independently.
|
||||
* @return Integer stop of the furthest undownsampled region.
|
||||
*/
|
||||
public int getDownsamplingExtent(final String sample) {
|
||||
return readStatesBySample.get(sample).getDownsamplingExtent();
|
||||
}
|
||||
|
||||
public SAMRecordState getFirst() {
|
||||
for (final String sample : samples) {
|
||||
PerSampleReadStateManager reads = readStatesBySample.get(sample);
|
||||
|
|
@ -501,13 +520,61 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
samplePartitioner.submitRead(iterator.next());
|
||||
}
|
||||
}
|
||||
samplePartitioner.complete();
|
||||
|
||||
for (final String sample : samples) {
|
||||
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
|
||||
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
|
||||
addReadsToSample(statesBySample, newReads);
|
||||
}
|
||||
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
|
||||
|
||||
Collection<SAMRecord> newReads = new ArrayList<SAMRecord>(aggregator.getSelectedReads());
|
||||
|
||||
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
|
||||
int numReads = statesBySample.size();
|
||||
int downsamplingExtent = aggregator.getDownsamplingExtent();
|
||||
|
||||
if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) {
|
||||
long readLimit = aggregator.getNumReadsSeen();
|
||||
addReadsToSample(statesBySample, newReads, readLimit);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
} else {
|
||||
int[] counts = statesBySample.getCountsPerAlignmentStart();
|
||||
int[] updatedCounts = new int[counts.length];
|
||||
System.arraycopy(counts, 0, updatedCounts, 0, counts.length);
|
||||
|
||||
boolean readPruned = true;
|
||||
while (numReads + newReads.size() > targetCoverage && readPruned) {
|
||||
readPruned = false;
|
||||
for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) {
|
||||
if (updatedCounts[alignmentStart] > 1) {
|
||||
updatedCounts[alignmentStart]--;
|
||||
numReads--;
|
||||
readPruned = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numReads == targetCoverage) {
|
||||
updatedCounts[0]--;
|
||||
numReads--;
|
||||
}
|
||||
|
||||
BitSet toPurge = new BitSet(readStates.size());
|
||||
int readOffset = 0;
|
||||
|
||||
for (int i = 0; i < updatedCounts.length; i++) {
|
||||
int n = counts[i];
|
||||
int k = updatedCounts[i];
|
||||
|
||||
for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k))
|
||||
toPurge.set(readOffset + purgedElement);
|
||||
|
||||
readOffset += counts[i];
|
||||
}
|
||||
downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge));
|
||||
|
||||
addReadsToSample(statesBySample, newReads, targetCoverage - numReads);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
}
|
||||
}
|
||||
samplePartitioner.reset();
|
||||
}
|
||||
|
||||
|
|
@ -516,134 +583,380 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
*
|
||||
* @param readStates The list of read states to add this collection of reads.
|
||||
* @param reads Reads to add. Selected reads will be pulled from this source.
|
||||
* @param maxReads Maximum number of reads to add.
|
||||
*/
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads, final long maxReads) {
|
||||
if (reads.isEmpty())
|
||||
return;
|
||||
|
||||
Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();
|
||||
|
||||
int readCount = 0;
|
||||
for (SAMRecord read : reads) {
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
if (readCount < maxReads) {
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
readCount++;
|
||||
}
|
||||
}
|
||||
|
||||
readStates.addStatesAtNextAlignmentStart(newReadStates);
|
||||
}
|
||||
|
||||
protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
|
||||
private int thisSampleReadStates = 0;
|
||||
private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
|
||||
performLevelingDownsampling ?
|
||||
new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
|
||||
null;
|
||||
private class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private final Queue<SAMRecordState> readStates = new LinkedList<SAMRecordState>();
|
||||
private final Deque<Counter> readStateCounter = new LinkedList<Counter>();
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
|
||||
if ( states.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
|
||||
thisSampleReadStates += states.size();
|
||||
readStates.addAll(states);
|
||||
readStateCounter.add(new Counter(states.size()));
|
||||
totalReadStates += states.size();
|
||||
|
||||
if ( levelingDownsampler != null ) {
|
||||
levelingDownsampler.submit(readStatesByAlignmentStart);
|
||||
levelingDownsampler.signalEndOfInput();
|
||||
|
||||
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
|
||||
// use returned List directly rather than make a copy, for efficiency's sake
|
||||
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
|
||||
levelingDownsampler.reset();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return readStatesByAlignmentStart.isEmpty();
|
||||
return readStates.isEmpty();
|
||||
}
|
||||
|
||||
public SAMRecordState peek() {
|
||||
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
|
||||
return readStates.peek();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return thisSampleReadStates;
|
||||
return readStates.size();
|
||||
}
|
||||
|
||||
public void specifyNewDownsamplingExtent(int downsamplingExtent) {
|
||||
this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent);
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public int[] getCountsPerAlignmentStart() {
|
||||
int[] counts = new int[readStateCounter.size()];
|
||||
int index = 0;
|
||||
for (Counter counter : readStateCounter)
|
||||
counts[index++] = counter.getCount();
|
||||
return counts;
|
||||
}
|
||||
|
||||
public Iterator<SAMRecordState> iterator() {
|
||||
return new Iterator<SAMRecordState>() {
|
||||
private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
|
||||
private LinkedList<SAMRecordState> currentPositionReadStates = null;
|
||||
private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;
|
||||
private Iterator<SAMRecordState> wrappedIterator = readStates.iterator();
|
||||
|
||||
public boolean hasNext() {
|
||||
return alignmentStartIterator.hasNext() ||
|
||||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
public SAMRecordState next() {
|
||||
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
|
||||
currentPositionReadStates = alignmentStartIterator.next();
|
||||
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
|
||||
}
|
||||
|
||||
return currentPositionReadStatesIterator.next();
|
||||
return wrappedIterator.next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
currentPositionReadStatesIterator.remove();
|
||||
thisSampleReadStates--;
|
||||
totalReadStates--;
|
||||
|
||||
if ( currentPositionReadStates.isEmpty() ) {
|
||||
alignmentStartIterator.remove();
|
||||
}
|
||||
wrappedIterator.remove();
|
||||
Counter counter = readStateCounter.peek();
|
||||
counter.decrement();
|
||||
if (counter.getCount() == 0)
|
||||
readStateCounter.remove();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Purge the given elements from the bitset. If an element in the bitset is true, purge
|
||||
* the corresponding read state.
|
||||
*
|
||||
* @param elements bits from the set to purge.
|
||||
* @return the extent of the final downsampled read.
|
||||
*/
|
||||
public int purge(final BitSet elements) {
|
||||
int downsamplingExtent = 0;
|
||||
|
||||
if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
|
||||
|
||||
Iterator<SAMRecordState> readStateIterator = readStates.iterator();
|
||||
|
||||
Iterator<Counter> counterIterator = readStateCounter.iterator();
|
||||
Counter currentCounter = counterIterator.next();
|
||||
|
||||
int readIndex = 0;
|
||||
long alignmentStartCounter = currentCounter.getCount();
|
||||
|
||||
int toPurge = elements.nextSetBit(0);
|
||||
int removedCount = 0;
|
||||
|
||||
while (readStateIterator.hasNext() && toPurge >= 0) {
|
||||
SAMRecordState state = readStateIterator.next();
|
||||
downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd());
|
||||
|
||||
if (readIndex == toPurge) {
|
||||
readStateIterator.remove();
|
||||
currentCounter.decrement();
|
||||
if (currentCounter.getCount() == 0)
|
||||
counterIterator.remove();
|
||||
removedCount++;
|
||||
toPurge = elements.nextSetBit(toPurge + 1);
|
||||
}
|
||||
|
||||
readIndex++;
|
||||
alignmentStartCounter--;
|
||||
if (alignmentStartCounter == 0 && counterIterator.hasNext()) {
|
||||
currentCounter = counterIterator.next();
|
||||
alignmentStartCounter = currentCounter.getCount();
|
||||
}
|
||||
}
|
||||
|
||||
totalReadStates -= removedCount;
|
||||
|
||||
return downsamplingExtent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
* Note: assuming that, whenever we downsample, we downsample to an integer capacity.
|
||||
*/
|
||||
private class SamplePartitioner {
|
||||
private Map<String, Collection<SAMRecord>> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
static private class Counter {
|
||||
private int count;
|
||||
|
||||
public SamplePartitioner() {
|
||||
readsBySample = new HashMap<String, Collection<SAMRecord>>();
|
||||
|
||||
for ( String sample : samples ) {
|
||||
readsBySample.put(sample, new ArrayList<SAMRecord>());
|
||||
}
|
||||
public Counter(int count) {
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).add(read);
|
||||
readsSeen++;
|
||||
public int getCount() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getReadsForSample(String sampleName) {
|
||||
if ( ! readsBySample.containsKey(sampleName) )
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for ( Collection<SAMRecord> perSampleReads : readsBySample.values() )
|
||||
perSampleReads.clear();
|
||||
readsSeen = 0;
|
||||
public void decrement() {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects reads passed to it based on a criteria decided through inheritance.
|
||||
* TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this.
|
||||
*/
|
||||
interface ReadSelector {
|
||||
/**
|
||||
* All previous selectors in the chain have allowed this read. Submit it to this selector for consideration.
|
||||
*
|
||||
* @param read the read to evaluate.
|
||||
*/
|
||||
public void submitRead(SAMRecord read);
|
||||
|
||||
/**
|
||||
* A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid.
|
||||
*
|
||||
* @param read the read previously rejected.
|
||||
*/
|
||||
public void notifyReadRejected(SAMRecord read);
|
||||
|
||||
/**
|
||||
* Signal the selector that read additions are complete.
|
||||
*/
|
||||
public void complete();
|
||||
|
||||
/**
|
||||
* Retrieve the number of reads seen by this selector so far.
|
||||
*
|
||||
* @return number of reads seen.
|
||||
*/
|
||||
public long getNumReadsSeen();
|
||||
|
||||
/**
|
||||
* Return the number of reads accepted by this selector so far.
|
||||
*
|
||||
* @return number of reads selected.
|
||||
*/
|
||||
public long getNumReadsSelected();
|
||||
|
||||
/**
|
||||
* Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the
|
||||
* last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at
|
||||
* position 3 whose cigar string is 76M, the value of this parameter will be 78.
|
||||
*
|
||||
* @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0.
|
||||
*/
|
||||
public int getDownsamplingExtent();
|
||||
|
||||
/**
|
||||
* Get the reads selected by this selector.
|
||||
*
|
||||
* @return collection of reads selected by this selector.
|
||||
*/
|
||||
public Collection<SAMRecord> getSelectedReads();
|
||||
|
||||
/**
|
||||
* Reset this collection to its pre-gathered state.
|
||||
*/
|
||||
public void reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Select every read passed in.
|
||||
*/
|
||||
class AllReadsSelector implements ReadSelector {
|
||||
private Collection<SAMRecord> reads = new LinkedList<SAMRecord>();
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
reads.add(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reads;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reads.clear();
|
||||
readsSeen = 0;
|
||||
downsamplingExtent = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Select N reads randomly from the input stream.
|
||||
*/
|
||||
class NRandomReadSelector implements ReadSelector {
|
||||
private final LegacyReservoirDownsampler<SAMRecord> reservoir;
|
||||
private final ReadSelector chainedSelector;
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) {
|
||||
this.reservoir = new LegacyReservoirDownsampler<SAMRecord>((int) readLimit);
|
||||
this.chainedSelector = chainedSelector;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
SAMRecord displaced = reservoir.add(read);
|
||||
if (displaced != null && chainedSelector != null) {
|
||||
chainedSelector.notifyReadRejected(read);
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
for (SAMRecord read : reservoir.getDownsampledContents())
|
||||
chainedSelector.submitRead(read);
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.complete();
|
||||
}
|
||||
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return reservoir.size();
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reservoir.getDownsampledContents();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reservoir.clear();
|
||||
downsamplingExtent = 0;
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.reset();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
class SamplePartitioner implements ReadSelector {
|
||||
private final Map<String, ReadSelector> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
|
||||
public SamplePartitioner(Map<String, ReadSelector> readSelectors) {
|
||||
readsBySample = readSelectors;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).submitRead(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).notifyReadRejected(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
int downsamplingExtent = 0;
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent());
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner.");
|
||||
}
|
||||
|
||||
public ReadSelector getSelectedReads(String sampleName) {
|
||||
if (!readsBySample.containsKey(sampleName))
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
storage.reset();
|
||||
readsSeen = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -31,14 +31,11 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.ReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -54,7 +51,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
|
||||
private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class);
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
private final ArrayList<String> samples;
|
||||
private final ReadStateManager readStates;
|
||||
|
||||
static private class SAMRecordState {
|
||||
protected static class SAMRecordState {
|
||||
SAMRecord read;
|
||||
int readOffset = -1; // how far are we offset from the start of the read bases?
|
||||
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
|
||||
|
|
@ -213,6 +210,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
//final boolean DEBUG2 = false && DEBUG;
|
||||
private ReadProperties readInfo;
|
||||
private AlignmentContext nextAlignmentContext;
|
||||
private boolean performDownsampling;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -224,7 +222,18 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
this.readInfo = readInformation;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.samples = new ArrayList<String>(samples);
|
||||
this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());
|
||||
|
||||
// LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're
|
||||
// downsampling to coverage by sample. SAMDataSource will have refrained from applying
|
||||
// any downsamplers to the read stream in this case, in the expectation that LIBS will
|
||||
// manage the downsampling. The reason for this is twofold: performance (don't have to
|
||||
// split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling
|
||||
// of reads (eg., using half of a read, and throwing the rest away).
|
||||
this.performDownsampling = readInfo.getDownsamplingMethod() != null &&
|
||||
readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
|
||||
readInfo.getDownsamplingMethod().toCoverage != null;
|
||||
|
||||
this.readStates = new ReadStateManager(samIterator);
|
||||
|
||||
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
|
||||
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
|
||||
|
|
@ -285,11 +294,13 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
final GenomeLoc location = getLocation();
|
||||
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
|
||||
// TODO: How can you determine here whether the current pileup has been downsampled?
|
||||
boolean hasBeenSampled = false;
|
||||
|
||||
for (final String sample : samples) {
|
||||
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
int size = 0; // number of elements in this sample's pileup
|
||||
int nDeletions = 0; // number of deletions in this sample's pileup
|
||||
|
|
@ -398,34 +409,20 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
private class ReadStateManager {
|
||||
protected class ReadStateManager {
|
||||
private final PeekableIterator<SAMRecord> iterator;
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final SamplePartitioner samplePartitioner;
|
||||
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
|
||||
private final int targetCoverage;
|
||||
private int totalReadStates = 0;
|
||||
|
||||
public ReadStateManager(Iterator<SAMRecord> source, DownsamplingMethod downsamplingMethod) {
|
||||
public ReadStateManager(Iterator<SAMRecord> source) {
|
||||
this.iterator = new PeekableIterator<SAMRecord>(source);
|
||||
this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE;
|
||||
switch (this.downsamplingMethod.type) {
|
||||
case BY_SAMPLE:
|
||||
if (downsamplingMethod.toCoverage == null)
|
||||
throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample");
|
||||
this.targetCoverage = downsamplingMethod.toCoverage;
|
||||
break;
|
||||
default:
|
||||
this.targetCoverage = Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
Map<String, ReadSelector> readSelectors = new HashMap<String, ReadSelector>();
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector());
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner(readSelectors);
|
||||
samplePartitioner = new SamplePartitioner(performDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -449,7 +446,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
public void remove() {
|
||||
wrappedIterator.remove();
|
||||
totalReadStates--;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
@ -477,17 +473,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
return readStatesBySample.get(sample).size();
|
||||
}
|
||||
|
||||
/**
|
||||
* The extent of downsampling; basically, the furthest base out which has 'fallen
|
||||
* victim' to the downsampler.
|
||||
*
|
||||
* @param sample Sample, downsampled independently.
|
||||
* @return Integer stop of the furthest undownsampled region.
|
||||
*/
|
||||
public int getDownsamplingExtent(final String sample) {
|
||||
return readStatesBySample.get(sample).getDownsamplingExtent();
|
||||
}
|
||||
|
||||
public SAMRecordState getFirst() {
|
||||
for (final String sample : samples) {
|
||||
PerSampleReadStateManager reads = readStatesBySample.get(sample);
|
||||
|
|
@ -520,61 +505,15 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
samplePartitioner.submitRead(iterator.next());
|
||||
}
|
||||
}
|
||||
samplePartitioner.complete();
|
||||
|
||||
samplePartitioner.doneSubmittingReads();
|
||||
|
||||
for (final String sample : samples) {
|
||||
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
|
||||
|
||||
Collection<SAMRecord> newReads = new ArrayList<SAMRecord>(aggregator.getSelectedReads());
|
||||
|
||||
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
|
||||
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
|
||||
int numReads = statesBySample.size();
|
||||
int downsamplingExtent = aggregator.getDownsamplingExtent();
|
||||
|
||||
if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) {
|
||||
long readLimit = aggregator.getNumReadsSeen();
|
||||
addReadsToSample(statesBySample, newReads, readLimit);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
} else {
|
||||
int[] counts = statesBySample.getCountsPerAlignmentStart();
|
||||
int[] updatedCounts = new int[counts.length];
|
||||
System.arraycopy(counts, 0, updatedCounts, 0, counts.length);
|
||||
|
||||
boolean readPruned = true;
|
||||
while (numReads + newReads.size() > targetCoverage && readPruned) {
|
||||
readPruned = false;
|
||||
for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) {
|
||||
if (updatedCounts[alignmentStart] > 1) {
|
||||
updatedCounts[alignmentStart]--;
|
||||
numReads--;
|
||||
readPruned = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numReads == targetCoverage) {
|
||||
updatedCounts[0]--;
|
||||
numReads--;
|
||||
}
|
||||
|
||||
BitSet toPurge = new BitSet(readStates.size());
|
||||
int readOffset = 0;
|
||||
|
||||
for (int i = 0; i < updatedCounts.length; i++) {
|
||||
int n = counts[i];
|
||||
int k = updatedCounts[i];
|
||||
|
||||
for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k))
|
||||
toPurge.set(readOffset + purgedElement);
|
||||
|
||||
readOffset += counts[i];
|
||||
}
|
||||
downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge));
|
||||
|
||||
addReadsToSample(statesBySample, newReads, targetCoverage - numReads);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
}
|
||||
addReadsToSample(statesBySample, newReads);
|
||||
}
|
||||
|
||||
samplePartitioner.reset();
|
||||
}
|
||||
|
||||
|
|
@ -583,380 +522,140 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
*
|
||||
* @param readStates The list of read states to add this collection of reads.
|
||||
* @param reads Reads to add. Selected reads will be pulled from this source.
|
||||
* @param maxReads Maximum number of reads to add.
|
||||
*/
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads, final long maxReads) {
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
|
||||
if (reads.isEmpty())
|
||||
return;
|
||||
|
||||
Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();
|
||||
int readCount = 0;
|
||||
|
||||
for (SAMRecord read : reads) {
|
||||
if (readCount < maxReads) {
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
readCount++;
|
||||
}
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
}
|
||||
|
||||
readStates.addStatesAtNextAlignmentStart(newReadStates);
|
||||
}
|
||||
|
||||
private class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private final Queue<SAMRecordState> readStates = new LinkedList<SAMRecordState>();
|
||||
private final Deque<Counter> readStateCounter = new LinkedList<Counter>();
|
||||
private int downsamplingExtent = 0;
|
||||
protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
|
||||
private int thisSampleReadStates = 0;
|
||||
private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
|
||||
performDownsampling ?
|
||||
new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
|
||||
null;
|
||||
|
||||
public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
|
||||
readStates.addAll(states);
|
||||
readStateCounter.add(new Counter(states.size()));
|
||||
if ( states.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
|
||||
thisSampleReadStates += states.size();
|
||||
totalReadStates += states.size();
|
||||
|
||||
if ( levelingDownsampler != null ) {
|
||||
levelingDownsampler.submit(readStatesByAlignmentStart);
|
||||
levelingDownsampler.signalEndOfInput();
|
||||
|
||||
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
|
||||
// use returned List directly rather than make a copy, for efficiency's sake
|
||||
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
|
||||
levelingDownsampler.reset();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return readStates.isEmpty();
|
||||
return readStatesByAlignmentStart.isEmpty();
|
||||
}
|
||||
|
||||
public SAMRecordState peek() {
|
||||
return readStates.peek();
|
||||
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return readStates.size();
|
||||
}
|
||||
|
||||
public void specifyNewDownsamplingExtent(int downsamplingExtent) {
|
||||
this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent);
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public int[] getCountsPerAlignmentStart() {
|
||||
int[] counts = new int[readStateCounter.size()];
|
||||
int index = 0;
|
||||
for (Counter counter : readStateCounter)
|
||||
counts[index++] = counter.getCount();
|
||||
return counts;
|
||||
return thisSampleReadStates;
|
||||
}
|
||||
|
||||
public Iterator<SAMRecordState> iterator() {
|
||||
return new Iterator<SAMRecordState>() {
|
||||
private Iterator<SAMRecordState> wrappedIterator = readStates.iterator();
|
||||
private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
|
||||
private LinkedList<SAMRecordState> currentPositionReadStates = null;
|
||||
private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;
|
||||
|
||||
public boolean hasNext() {
|
||||
return wrappedIterator.hasNext();
|
||||
return alignmentStartIterator.hasNext() ||
|
||||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
|
||||
}
|
||||
|
||||
public SAMRecordState next() {
|
||||
return wrappedIterator.next();
|
||||
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
|
||||
currentPositionReadStates = alignmentStartIterator.next();
|
||||
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
|
||||
}
|
||||
|
||||
return currentPositionReadStatesIterator.next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
wrappedIterator.remove();
|
||||
Counter counter = readStateCounter.peek();
|
||||
counter.decrement();
|
||||
if (counter.getCount() == 0)
|
||||
readStateCounter.remove();
|
||||
currentPositionReadStatesIterator.remove();
|
||||
thisSampleReadStates--;
|
||||
totalReadStates--;
|
||||
|
||||
if ( currentPositionReadStates.isEmpty() ) {
|
||||
alignmentStartIterator.remove();
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Purge the given elements from the bitset. If an element in the bitset is true, purge
|
||||
* the corresponding read state.
|
||||
*
|
||||
* @param elements bits from the set to purge.
|
||||
* @return the extent of the final downsampled read.
|
||||
*/
|
||||
public int purge(final BitSet elements) {
|
||||
int downsamplingExtent = 0;
|
||||
/**
|
||||
* Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler.
|
||||
*
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
private class SamplePartitioner {
|
||||
private Map<String, Downsampler<SAMRecord>> readsBySample;
|
||||
|
||||
if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
|
||||
public SamplePartitioner( boolean downsampleReads ) {
|
||||
readsBySample = new HashMap<String, Downsampler<SAMRecord>>();
|
||||
|
||||
Iterator<SAMRecordState> readStateIterator = readStates.iterator();
|
||||
for ( String sample : samples ) {
|
||||
readsBySample.put(sample,
|
||||
downsampleReads ? new ReservoirDownsampler<SAMRecord>(readInfo.getDownsamplingMethod().toCoverage) :
|
||||
new PassThroughDownsampler<SAMRecord>());
|
||||
}
|
||||
}
|
||||
|
||||
Iterator<Counter> counterIterator = readStateCounter.iterator();
|
||||
Counter currentCounter = counterIterator.next();
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).submit(read);
|
||||
}
|
||||
|
||||
int readIndex = 0;
|
||||
long alignmentStartCounter = currentCounter.getCount();
|
||||
public void doneSubmittingReads() {
|
||||
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
|
||||
perSampleReads.getValue().signalEndOfInput();
|
||||
}
|
||||
}
|
||||
|
||||
int toPurge = elements.nextSetBit(0);
|
||||
int removedCount = 0;
|
||||
public Collection<SAMRecord> getReadsForSample(String sampleName) {
|
||||
if ( ! readsBySample.containsKey(sampleName) )
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
|
||||
while (readStateIterator.hasNext() && toPurge >= 0) {
|
||||
SAMRecordState state = readStateIterator.next();
|
||||
downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd());
|
||||
return readsBySample.get(sampleName).consumeFinalizedItems();
|
||||
}
|
||||
|
||||
if (readIndex == toPurge) {
|
||||
readStateIterator.remove();
|
||||
currentCounter.decrement();
|
||||
if (currentCounter.getCount() == 0)
|
||||
counterIterator.remove();
|
||||
removedCount++;
|
||||
toPurge = elements.nextSetBit(toPurge + 1);
|
||||
}
|
||||
|
||||
readIndex++;
|
||||
alignmentStartCounter--;
|
||||
if (alignmentStartCounter == 0 && counterIterator.hasNext()) {
|
||||
currentCounter = counterIterator.next();
|
||||
alignmentStartCounter = currentCounter.getCount();
|
||||
}
|
||||
}
|
||||
|
||||
totalReadStates -= removedCount;
|
||||
|
||||
return downsamplingExtent;
|
||||
public void reset() {
|
||||
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
|
||||
perSampleReads.getValue().clear();
|
||||
perSampleReads.getValue().reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: assuming that, whenever we downsample, we downsample to an integer capacity.
|
||||
*/
|
||||
static private class Counter {
|
||||
private int count;
|
||||
|
||||
public Counter(int count) {
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
public int getCount() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public void decrement() {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects reads passed to it based on a criteria decided through inheritance.
|
||||
* TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this.
|
||||
*/
|
||||
interface ReadSelector {
|
||||
/**
|
||||
* All previous selectors in the chain have allowed this read. Submit it to this selector for consideration.
|
||||
*
|
||||
* @param read the read to evaluate.
|
||||
*/
|
||||
public void submitRead(SAMRecord read);
|
||||
|
||||
/**
|
||||
* A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid.
|
||||
*
|
||||
* @param read the read previously rejected.
|
||||
*/
|
||||
public void notifyReadRejected(SAMRecord read);
|
||||
|
||||
/**
|
||||
* Signal the selector that read additions are complete.
|
||||
*/
|
||||
public void complete();
|
||||
|
||||
/**
|
||||
* Retrieve the number of reads seen by this selector so far.
|
||||
*
|
||||
* @return number of reads seen.
|
||||
*/
|
||||
public long getNumReadsSeen();
|
||||
|
||||
/**
|
||||
* Return the number of reads accepted by this selector so far.
|
||||
*
|
||||
* @return number of reads selected.
|
||||
*/
|
||||
public long getNumReadsSelected();
|
||||
|
||||
/**
|
||||
* Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the
|
||||
* last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at
|
||||
* position 3 whose cigar string is 76M, the value of this parameter will be 78.
|
||||
*
|
||||
* @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0.
|
||||
*/
|
||||
public int getDownsamplingExtent();
|
||||
|
||||
/**
|
||||
* Get the reads selected by this selector.
|
||||
*
|
||||
* @return collection of reads selected by this selector.
|
||||
*/
|
||||
public Collection<SAMRecord> getSelectedReads();
|
||||
|
||||
/**
|
||||
* Reset this collection to its pre-gathered state.
|
||||
*/
|
||||
public void reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Select every read passed in.
|
||||
*/
|
||||
class AllReadsSelector implements ReadSelector {
|
||||
private Collection<SAMRecord> reads = new LinkedList<SAMRecord>();
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
reads.add(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reads;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reads.clear();
|
||||
readsSeen = 0;
|
||||
downsamplingExtent = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Select N reads randomly from the input stream.
|
||||
*/
|
||||
class NRandomReadSelector implements ReadSelector {
|
||||
private final ReservoirDownsampler<SAMRecord> reservoir;
|
||||
private final ReadSelector chainedSelector;
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) {
|
||||
this.reservoir = new ReservoirDownsampler<SAMRecord>((int) readLimit);
|
||||
this.chainedSelector = chainedSelector;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
SAMRecord displaced = reservoir.add(read);
|
||||
if (displaced != null && chainedSelector != null) {
|
||||
chainedSelector.notifyReadRejected(read);
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
for (SAMRecord read : reservoir.getDownsampledContents())
|
||||
chainedSelector.submitRead(read);
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.complete();
|
||||
}
|
||||
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return reservoir.size();
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reservoir.getDownsampledContents();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reservoir.clear();
|
||||
downsamplingExtent = 0;
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.reset();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
class SamplePartitioner implements ReadSelector {
|
||||
private final Map<String, ReadSelector> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
|
||||
public SamplePartitioner(Map<String, ReadSelector> readSelectors) {
|
||||
readsBySample = readSelectors;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).submitRead(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).notifyReadRejected(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
int downsamplingExtent = 0;
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent());
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner.");
|
||||
}
|
||||
|
||||
public ReadSelector getSelectedReads(String sampleName) {
|
||||
if (!readsBySample.containsKey(sampleName))
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
storage.reset();
|
||||
readsSeen = 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -8,6 +8,8 @@ import java.util.Collection;
|
|||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* THIS IMPLEMENTATION IS BROKEN AND WILL BE REMOVED ONCE THE DOWNSAMPLING ENGINE FORK COLLAPSES
|
||||
*
|
||||
* Randomly downsample from a stream of elements. This algorithm is a direct,
|
||||
* naive implementation of reservoir downsampling as described in "Random Downsampling
|
||||
* with a Reservoir" (Vitter 1985). At time of writing, this paper is located here:
|
||||
|
|
@ -16,7 +18,7 @@ import java.util.Iterator;
|
|||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class ReservoirDownsampler<T> {
|
||||
public class LegacyReservoirDownsampler<T> {
|
||||
/**
|
||||
* The reservoir of elements tracked by this downsampler.
|
||||
*/
|
||||
|
|
@ -31,7 +33,7 @@ public class ReservoirDownsampler<T> {
|
|||
* Create a new downsampler with the given source iterator and given comparator.
|
||||
* @param maxElements What is the maximum number of reads that can be returned in any call of this
|
||||
*/
|
||||
public ReservoirDownsampler(final int maxElements) {
|
||||
public LegacyReservoirDownsampler(final int maxElements) {
|
||||
if(maxElements < 0)
|
||||
throw new ReviewedStingException("Unable to work with an negative size collection of elements");
|
||||
this.reservoir = new ArrayList<T>(maxElements);
|
||||
|
|
@ -32,11 +32,10 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.walkers.qc.CountLoci;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -85,7 +84,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark {
|
|||
GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary());
|
||||
// Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out?
|
||||
Iterator<SAMRecord> readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter());
|
||||
LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
|
||||
LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups());
|
||||
while(locusIteratorByState.hasNext()) {
|
||||
locusIteratorByState.next().getLocation();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,10 +45,10 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class ExperimentalReadShardBalancerUnitTest extends BaseTest {
|
||||
public class ReadShardBalancerUnitTest extends BaseTest {
|
||||
|
||||
/**
|
||||
* Tests to ensure that ExperimentalReadShardBalancer works as expected and does not place shard boundaries
|
||||
* Tests to ensure that ReadShardBalancer works as expected and does not place shard boundaries
|
||||
* at inappropriate places, such as within an alignment start position
|
||||
*/
|
||||
private static class ExperimentalReadShardBalancerTest extends TestDataProvider {
|
||||
|
|
@ -74,7 +74,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest {
|
|||
this.stackSize = stackSize;
|
||||
this.numUnmappedReads = numUnmappedReads;
|
||||
|
||||
this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true);
|
||||
this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, false);
|
||||
this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads;
|
||||
|
||||
setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d",
|
||||
|
|
@ -96,7 +96,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest {
|
|||
new ArrayList<ReadFilter>(),
|
||||
false);
|
||||
|
||||
Iterable<Shard> shardIterator = dataSource.createShardIteratorOverAllReads(new ExperimentalReadShardBalancer());
|
||||
Iterable<Shard> shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
|
||||
|
||||
SAMRecord readAtEndOfLastShard = null;
|
||||
int totalReadsSeen = 0;
|
||||
|
|
@ -7,10 +7,8 @@ import org.broadinstitute.sting.gatk.ReadProperties;
|
|||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -21,14 +19,17 @@ import org.testng.annotations.BeforeClass;
|
|||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* testing of the experimental version of LocusIteratorByState
|
||||
* testing of the LEGACY version of LocusIteratorByState
|
||||
*/
|
||||
public class LocusIteratorByStateExperimentalUnitTest extends BaseTest {
|
||||
public class LegacyLocusIteratorByStateUnitTest extends BaseTest {
|
||||
private static SAMFileHeader header;
|
||||
private LocusIteratorByStateExperimental li;
|
||||
private LegacyLocusIteratorByState li;
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
@BeforeClass
|
||||
|
|
@ -37,8 +38,8 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest {
|
|||
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
private LocusIteratorByStateExperimental makeLTBS(List<SAMRecord> reads, ReadProperties readAttributes) {
|
||||
return new LocusIteratorByStateExperimental(new FakeCloseableIterator<SAMRecord>(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups());
|
||||
private LegacyLocusIteratorByState makeLTBS(List<SAMRecord> reads, ReadProperties readAttributes) {
|
||||
return new LegacyLocusIteratorByState(new FakeCloseableIterator<SAMRecord>(reads.iterator()), readAttributes, genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -328,184 +329,14 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest {
|
|||
// End comprehensive LIBS/PileupElement tests //
|
||||
////////////////////////////////////////////////
|
||||
|
||||
|
||||
///////////////////////////////////////
|
||||
// Read State Manager Tests //
|
||||
///////////////////////////////////////
|
||||
|
||||
private class PerSampleReadStateManagerTest extends TestDataProvider {
|
||||
private List<Integer> readCountsPerAlignmentStart;
|
||||
private List<SAMRecord> reads;
|
||||
private List<ArrayList<LocusIteratorByStateExperimental.SAMRecordState>> recordStatesByAlignmentStart;
|
||||
private int removalInterval;
|
||||
|
||||
public PerSampleReadStateManagerTest( List<Integer> readCountsPerAlignmentStart, int removalInterval ) {
|
||||
super(PerSampleReadStateManagerTest.class);
|
||||
|
||||
this.readCountsPerAlignmentStart = readCountsPerAlignmentStart;
|
||||
this.removalInterval = removalInterval;
|
||||
|
||||
reads = new ArrayList<SAMRecord>();
|
||||
recordStatesByAlignmentStart = new ArrayList<ArrayList<LocusIteratorByStateExperimental.SAMRecordState>>();
|
||||
|
||||
setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d",
|
||||
getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval));
|
||||
}
|
||||
|
||||
public void run() {
|
||||
LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList<SAMRecord>(), createTestReadProperties());
|
||||
LocusIteratorByStateExperimental.ReadStateManager readStateManager =
|
||||
libs.new ReadStateManager(new ArrayList<SAMRecord>().iterator());
|
||||
LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager =
|
||||
readStateManager.new PerSampleReadStateManager();
|
||||
|
||||
makeReads();
|
||||
|
||||
for ( ArrayList<LocusIteratorByStateExperimental.SAMRecordState> stackRecordStates : recordStatesByAlignmentStart ) {
|
||||
perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates);
|
||||
}
|
||||
|
||||
// read state manager should have the right number of reads
|
||||
Assert.assertEquals(reads.size(), perSampleReadStateManager.size());
|
||||
|
||||
Iterator<SAMRecord> originalReadsIterator = reads.iterator();
|
||||
Iterator<LocusIteratorByStateExperimental.SAMRecordState> recordStateIterator = perSampleReadStateManager.iterator();
|
||||
int recordStateCount = 0;
|
||||
int numReadStatesRemoved = 0;
|
||||
|
||||
// Do a first-pass validation of the record state iteration by making sure we get back everything we
|
||||
// put in, in the same order, doing any requested removals of read states along the way
|
||||
while ( recordStateIterator.hasNext() ) {
|
||||
LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next();
|
||||
recordStateCount++;
|
||||
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
|
||||
|
||||
Assert.assertTrue(originalReadsIterator.hasNext());
|
||||
SAMRecord originalRead = originalReadsIterator.next();
|
||||
|
||||
// The read we get back should be literally the same read in memory as we put in
|
||||
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
|
||||
|
||||
// If requested, remove a read state every removalInterval states
|
||||
if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) {
|
||||
recordStateIterator.remove();
|
||||
numReadStatesRemoved++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertFalse(originalReadsIterator.hasNext());
|
||||
|
||||
// If we removed any read states, do a second pass through the read states to make sure the right
|
||||
// states were removed
|
||||
if ( numReadStatesRemoved > 0 ) {
|
||||
Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved);
|
||||
|
||||
originalReadsIterator = reads.iterator();
|
||||
recordStateIterator = perSampleReadStateManager.iterator();
|
||||
int readCount = 0;
|
||||
int readStateCount = 0;
|
||||
|
||||
// Match record states with the reads that should remain after removal
|
||||
while ( recordStateIterator.hasNext() ) {
|
||||
LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next();
|
||||
readStateCount++;
|
||||
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
|
||||
|
||||
Assert.assertTrue(originalReadsIterator.hasNext());
|
||||
|
||||
SAMRecord originalRead = originalReadsIterator.next();
|
||||
readCount++;
|
||||
|
||||
if ( readCount % removalInterval == 0 ) {
|
||||
originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded
|
||||
readCount++;
|
||||
}
|
||||
|
||||
// The read we get back should be literally the same read in memory as we put in (after accounting for removals)
|
||||
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
|
||||
}
|
||||
|
||||
Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved);
|
||||
}
|
||||
|
||||
// Allow memory used by this test to be reclaimed
|
||||
readCountsPerAlignmentStart = null;
|
||||
reads = null;
|
||||
recordStatesByAlignmentStart = null;
|
||||
}
|
||||
|
||||
private void makeReads() {
|
||||
int alignmentStart = 1;
|
||||
|
||||
for ( int readsThisStack : readCountsPerAlignmentStart ) {
|
||||
ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
|
||||
ArrayList<LocusIteratorByStateExperimental.SAMRecordState> stackRecordStates = new ArrayList<LocusIteratorByStateExperimental.SAMRecordState>();
|
||||
|
||||
for ( SAMRecord read : stackReads ) {
|
||||
stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read));
|
||||
}
|
||||
|
||||
reads.addAll(stackReads);
|
||||
recordStatesByAlignmentStart.add(stackRecordStates);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "PerSampleReadStateManagerTestDataProvider")
|
||||
public Object[][] createPerSampleReadStateManagerTests() {
|
||||
for ( List<Integer> thisTestReadStateCounts : Arrays.asList( Arrays.asList(1),
|
||||
Arrays.asList(2),
|
||||
Arrays.asList(10),
|
||||
Arrays.asList(1, 1),
|
||||
Arrays.asList(2, 2),
|
||||
Arrays.asList(10, 10),
|
||||
Arrays.asList(1, 10),
|
||||
Arrays.asList(10, 1),
|
||||
Arrays.asList(1, 1, 1),
|
||||
Arrays.asList(2, 2, 2),
|
||||
Arrays.asList(10, 10, 10),
|
||||
Arrays.asList(1, 1, 1, 1, 1, 1),
|
||||
Arrays.asList(10, 10, 10, 10, 10, 10),
|
||||
Arrays.asList(1, 2, 10, 1, 2, 10)
|
||||
) ) {
|
||||
|
||||
for ( int removalInterval : Arrays.asList(0, 2, 3) ) {
|
||||
new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval);
|
||||
}
|
||||
}
|
||||
|
||||
return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PerSampleReadStateManagerTestDataProvider")
|
||||
public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) {
|
||||
logger.warn("Running test: " + test);
|
||||
|
||||
test.run();
|
||||
}
|
||||
|
||||
///////////////////////////////////////
|
||||
// End Read State Manager Tests //
|
||||
///////////////////////////////////////
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////
|
||||
// Helper methods / classes //
|
||||
///////////////////////////////////////
|
||||
|
||||
private static ReadProperties createTestReadProperties() {
|
||||
return createTestReadProperties(null);
|
||||
}
|
||||
|
||||
private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) {
|
||||
return new ReadProperties(
|
||||
Collections.<SAMReaderID>emptyList(),
|
||||
new SAMFileHeader(),
|
||||
SAMFileHeader.SortOrder.coordinate,
|
||||
false,
|
||||
SAMFileReader.ValidationStringency.STRICT,
|
||||
downsamplingMethod,
|
||||
null,
|
||||
new ValidationExclusion(),
|
||||
Collections.<ReadFilter>emptyList(),
|
||||
Collections.<ReadTransformer>emptyList(),
|
||||
|
|
@ -513,136 +344,137 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest {
|
|||
(byte) -1
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private static class FakeCloseableIterator<T> implements CloseableIterator<T> {
|
||||
Iterator<T> iterator;
|
||||
class FakeCloseableIterator<T> implements CloseableIterator<T> {
|
||||
Iterator<T> iterator;
|
||||
|
||||
public FakeCloseableIterator(Iterator<T> it) {
|
||||
iterator = it;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return iterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
return iterator.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Don't remove!");
|
||||
}
|
||||
public FakeCloseableIterator(Iterator<T> it) {
|
||||
iterator = it;
|
||||
}
|
||||
|
||||
private static final class LIBS_position {
|
||||
@Override
|
||||
public void close() {}
|
||||
|
||||
SAMRecord read;
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return iterator.hasNext();
|
||||
}
|
||||
|
||||
final int numOperators;
|
||||
int currentOperatorIndex = 0;
|
||||
int currentPositionOnOperator = 0;
|
||||
int currentReadOffset = 0;
|
||||
@Override
|
||||
public T next() {
|
||||
return iterator.next();
|
||||
}
|
||||
|
||||
boolean isBeforeDeletionStart = false;
|
||||
boolean isBeforeDeletedBase = false;
|
||||
boolean isAfterDeletionEnd = false;
|
||||
boolean isAfterDeletedBase = false;
|
||||
boolean isBeforeInsertion = false;
|
||||
boolean isAfterInsertion = false;
|
||||
boolean isNextToSoftClip = false;
|
||||
|
||||
boolean sawMop = false;
|
||||
|
||||
public LIBS_position(final SAMRecord read) {
|
||||
this.read = read;
|
||||
numOperators = read.getCigar().numCigarElements();
|
||||
}
|
||||
|
||||
public int getCurrentReadOffset() {
|
||||
return Math.max(0, currentReadOffset - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
|
||||
*/
|
||||
public boolean stepForwardOnGenome() {
|
||||
if ( currentOperatorIndex == numOperators )
|
||||
return false;
|
||||
|
||||
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
if ( currentPositionOnOperator >= curElement.getLength() ) {
|
||||
if ( ++currentOperatorIndex == numOperators )
|
||||
return false;
|
||||
|
||||
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
currentPositionOnOperator = 0;
|
||||
}
|
||||
|
||||
switch ( curElement.getOperator() ) {
|
||||
case I: // insertion w.r.t. the reference
|
||||
if ( !sawMop )
|
||||
break;
|
||||
case S: // soft clip
|
||||
currentReadOffset += curElement.getLength();
|
||||
case H: // hard clip
|
||||
case P: // padding
|
||||
currentOperatorIndex++;
|
||||
return stepForwardOnGenome();
|
||||
|
||||
case D: // deletion w.r.t. the reference
|
||||
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
sawMop = true;
|
||||
currentReadOffset++;
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
|
||||
}
|
||||
|
||||
final boolean isFirstOp = currentOperatorIndex == 0;
|
||||
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
|
||||
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
|
||||
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
|
||||
|
||||
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
|
||||
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
|
||||
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|
||||
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
|
||||
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
|
||||
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|
||||
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean isBeforeOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isLastOp,
|
||||
final boolean isLastBaseOfOp) {
|
||||
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
|
||||
}
|
||||
|
||||
private static boolean isAfterOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isFirstOp,
|
||||
final boolean isFirstBaseOfOp) {
|
||||
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
|
||||
}
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Don't remove!");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
final class LIBS_position {
|
||||
|
||||
SAMRecord read;
|
||||
|
||||
final int numOperators;
|
||||
int currentOperatorIndex = 0;
|
||||
int currentPositionOnOperator = 0;
|
||||
int currentReadOffset = 0;
|
||||
|
||||
boolean isBeforeDeletionStart = false;
|
||||
boolean isBeforeDeletedBase = false;
|
||||
boolean isAfterDeletionEnd = false;
|
||||
boolean isAfterDeletedBase = false;
|
||||
boolean isBeforeInsertion = false;
|
||||
boolean isAfterInsertion = false;
|
||||
boolean isNextToSoftClip = false;
|
||||
|
||||
boolean sawMop = false;
|
||||
|
||||
public LIBS_position(final SAMRecord read) {
|
||||
this.read = read;
|
||||
numOperators = read.getCigar().numCigarElements();
|
||||
}
|
||||
|
||||
public int getCurrentReadOffset() {
|
||||
return Math.max(0, currentReadOffset - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
|
||||
*/
|
||||
public boolean stepForwardOnGenome() {
|
||||
if ( currentOperatorIndex == numOperators )
|
||||
return false;
|
||||
|
||||
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
if ( currentPositionOnOperator >= curElement.getLength() ) {
|
||||
if ( ++currentOperatorIndex == numOperators )
|
||||
return false;
|
||||
|
||||
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
currentPositionOnOperator = 0;
|
||||
}
|
||||
|
||||
switch ( curElement.getOperator() ) {
|
||||
case I: // insertion w.r.t. the reference
|
||||
if ( !sawMop )
|
||||
break;
|
||||
case S: // soft clip
|
||||
currentReadOffset += curElement.getLength();
|
||||
case H: // hard clip
|
||||
case P: // padding
|
||||
currentOperatorIndex++;
|
||||
return stepForwardOnGenome();
|
||||
|
||||
case D: // deletion w.r.t. the reference
|
||||
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
sawMop = true;
|
||||
currentReadOffset++;
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
|
||||
}
|
||||
|
||||
final boolean isFirstOp = currentOperatorIndex == 0;
|
||||
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
|
||||
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
|
||||
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
|
||||
|
||||
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
|
||||
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
|
||||
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|
||||
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
|
||||
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
|
||||
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|
||||
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean isBeforeOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isLastOp,
|
||||
final boolean isLastBaseOfOp) {
|
||||
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
|
||||
}
|
||||
|
||||
private static boolean isAfterOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isFirstOp,
|
||||
final boolean isFirstBaseOfOp) {
|
||||
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
|
||||
}
|
||||
}
|
||||
|
|
@ -7,8 +7,10 @@ import org.broadinstitute.sting.gatk.ReadProperties;
|
|||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -19,13 +21,10 @@ import org.testng.annotations.BeforeClass;
|
|||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* testing of the LocusIteratorByState
|
||||
* testing of the new (non-legacy) version of LocusIteratorByState
|
||||
*/
|
||||
public class LocusIteratorByStateUnitTest extends BaseTest {
|
||||
private static SAMFileHeader header;
|
||||
|
|
@ -329,14 +328,184 @@ public class LocusIteratorByStateUnitTest extends BaseTest {
|
|||
// End comprehensive LIBS/PileupElement tests //
|
||||
////////////////////////////////////////////////
|
||||
|
||||
|
||||
///////////////////////////////////////
|
||||
// Read State Manager Tests //
|
||||
///////////////////////////////////////
|
||||
|
||||
private class PerSampleReadStateManagerTest extends TestDataProvider {
|
||||
private List<Integer> readCountsPerAlignmentStart;
|
||||
private List<SAMRecord> reads;
|
||||
private List<ArrayList<LocusIteratorByState.SAMRecordState>> recordStatesByAlignmentStart;
|
||||
private int removalInterval;
|
||||
|
||||
public PerSampleReadStateManagerTest( List<Integer> readCountsPerAlignmentStart, int removalInterval ) {
|
||||
super(PerSampleReadStateManagerTest.class);
|
||||
|
||||
this.readCountsPerAlignmentStart = readCountsPerAlignmentStart;
|
||||
this.removalInterval = removalInterval;
|
||||
|
||||
reads = new ArrayList<SAMRecord>();
|
||||
recordStatesByAlignmentStart = new ArrayList<ArrayList<LocusIteratorByState.SAMRecordState>>();
|
||||
|
||||
setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d",
|
||||
getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval));
|
||||
}
|
||||
|
||||
public void run() {
|
||||
LocusIteratorByState libs = makeLTBS(new ArrayList<SAMRecord>(), createTestReadProperties());
|
||||
LocusIteratorByState.ReadStateManager readStateManager =
|
||||
libs.new ReadStateManager(new ArrayList<SAMRecord>().iterator());
|
||||
LocusIteratorByState.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager =
|
||||
readStateManager.new PerSampleReadStateManager();
|
||||
|
||||
makeReads();
|
||||
|
||||
for ( ArrayList<LocusIteratorByState.SAMRecordState> stackRecordStates : recordStatesByAlignmentStart ) {
|
||||
perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates);
|
||||
}
|
||||
|
||||
// read state manager should have the right number of reads
|
||||
Assert.assertEquals(reads.size(), perSampleReadStateManager.size());
|
||||
|
||||
Iterator<SAMRecord> originalReadsIterator = reads.iterator();
|
||||
Iterator<LocusIteratorByState.SAMRecordState> recordStateIterator = perSampleReadStateManager.iterator();
|
||||
int recordStateCount = 0;
|
||||
int numReadStatesRemoved = 0;
|
||||
|
||||
// Do a first-pass validation of the record state iteration by making sure we get back everything we
|
||||
// put in, in the same order, doing any requested removals of read states along the way
|
||||
while ( recordStateIterator.hasNext() ) {
|
||||
LocusIteratorByState.SAMRecordState readState = recordStateIterator.next();
|
||||
recordStateCount++;
|
||||
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
|
||||
|
||||
Assert.assertTrue(originalReadsIterator.hasNext());
|
||||
SAMRecord originalRead = originalReadsIterator.next();
|
||||
|
||||
// The read we get back should be literally the same read in memory as we put in
|
||||
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
|
||||
|
||||
// If requested, remove a read state every removalInterval states
|
||||
if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) {
|
||||
recordStateIterator.remove();
|
||||
numReadStatesRemoved++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertFalse(originalReadsIterator.hasNext());
|
||||
|
||||
// If we removed any read states, do a second pass through the read states to make sure the right
|
||||
// states were removed
|
||||
if ( numReadStatesRemoved > 0 ) {
|
||||
Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved);
|
||||
|
||||
originalReadsIterator = reads.iterator();
|
||||
recordStateIterator = perSampleReadStateManager.iterator();
|
||||
int readCount = 0;
|
||||
int readStateCount = 0;
|
||||
|
||||
// Match record states with the reads that should remain after removal
|
||||
while ( recordStateIterator.hasNext() ) {
|
||||
LocusIteratorByState.SAMRecordState readState = recordStateIterator.next();
|
||||
readStateCount++;
|
||||
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
|
||||
|
||||
Assert.assertTrue(originalReadsIterator.hasNext());
|
||||
|
||||
SAMRecord originalRead = originalReadsIterator.next();
|
||||
readCount++;
|
||||
|
||||
if ( readCount % removalInterval == 0 ) {
|
||||
originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded
|
||||
readCount++;
|
||||
}
|
||||
|
||||
// The read we get back should be literally the same read in memory as we put in (after accounting for removals)
|
||||
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
|
||||
}
|
||||
|
||||
Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved);
|
||||
}
|
||||
|
||||
// Allow memory used by this test to be reclaimed
|
||||
readCountsPerAlignmentStart = null;
|
||||
reads = null;
|
||||
recordStatesByAlignmentStart = null;
|
||||
}
|
||||
|
||||
private void makeReads() {
|
||||
int alignmentStart = 1;
|
||||
|
||||
for ( int readsThisStack : readCountsPerAlignmentStart ) {
|
||||
ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
|
||||
ArrayList<LocusIteratorByState.SAMRecordState> stackRecordStates = new ArrayList<LocusIteratorByState.SAMRecordState>();
|
||||
|
||||
for ( SAMRecord read : stackReads ) {
|
||||
stackRecordStates.add(new LocusIteratorByState.SAMRecordState(read));
|
||||
}
|
||||
|
||||
reads.addAll(stackReads);
|
||||
recordStatesByAlignmentStart.add(stackRecordStates);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "PerSampleReadStateManagerTestDataProvider")
|
||||
public Object[][] createPerSampleReadStateManagerTests() {
|
||||
for ( List<Integer> thisTestReadStateCounts : Arrays.asList( Arrays.asList(1),
|
||||
Arrays.asList(2),
|
||||
Arrays.asList(10),
|
||||
Arrays.asList(1, 1),
|
||||
Arrays.asList(2, 2),
|
||||
Arrays.asList(10, 10),
|
||||
Arrays.asList(1, 10),
|
||||
Arrays.asList(10, 1),
|
||||
Arrays.asList(1, 1, 1),
|
||||
Arrays.asList(2, 2, 2),
|
||||
Arrays.asList(10, 10, 10),
|
||||
Arrays.asList(1, 1, 1, 1, 1, 1),
|
||||
Arrays.asList(10, 10, 10, 10, 10, 10),
|
||||
Arrays.asList(1, 2, 10, 1, 2, 10)
|
||||
) ) {
|
||||
|
||||
for ( int removalInterval : Arrays.asList(0, 2, 3) ) {
|
||||
new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval);
|
||||
}
|
||||
}
|
||||
|
||||
return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PerSampleReadStateManagerTestDataProvider")
|
||||
public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) {
|
||||
logger.warn("Running test: " + test);
|
||||
|
||||
test.run();
|
||||
}
|
||||
|
||||
///////////////////////////////////////
|
||||
// End Read State Manager Tests //
|
||||
///////////////////////////////////////
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////
|
||||
// Helper methods / classes //
|
||||
///////////////////////////////////////
|
||||
|
||||
private static ReadProperties createTestReadProperties() {
|
||||
return createTestReadProperties(null);
|
||||
}
|
||||
|
||||
private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) {
|
||||
return new ReadProperties(
|
||||
Collections.<SAMReaderID>emptyList(),
|
||||
new SAMFileHeader(),
|
||||
SAMFileHeader.SortOrder.coordinate,
|
||||
false,
|
||||
SAMFileReader.ValidationStringency.STRICT,
|
||||
null,
|
||||
downsamplingMethod,
|
||||
new ValidationExclusion(),
|
||||
Collections.<ReadFilter>emptyList(),
|
||||
Collections.<ReadTransformer>emptyList(),
|
||||
|
|
@ -344,137 +513,136 @@ public class LocusIteratorByStateUnitTest extends BaseTest {
|
|||
(byte) -1
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
class FakeCloseableIterator<T> implements CloseableIterator<T> {
|
||||
Iterator<T> iterator;
|
||||
private static class FakeCloseableIterator<T> implements CloseableIterator<T> {
|
||||
Iterator<T> iterator;
|
||||
|
||||
public FakeCloseableIterator(Iterator<T> it) {
|
||||
iterator = it;
|
||||
public FakeCloseableIterator(Iterator<T> it) {
|
||||
iterator = it;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return iterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
return iterator.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Don't remove!");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
private static final class LIBS_position {
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return iterator.hasNext();
|
||||
}
|
||||
SAMRecord read;
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
return iterator.next();
|
||||
}
|
||||
final int numOperators;
|
||||
int currentOperatorIndex = 0;
|
||||
int currentPositionOnOperator = 0;
|
||||
int currentReadOffset = 0;
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Don't remove!");
|
||||
}
|
||||
}
|
||||
boolean isBeforeDeletionStart = false;
|
||||
boolean isBeforeDeletedBase = false;
|
||||
boolean isAfterDeletionEnd = false;
|
||||
boolean isAfterDeletedBase = false;
|
||||
boolean isBeforeInsertion = false;
|
||||
boolean isAfterInsertion = false;
|
||||
boolean isNextToSoftClip = false;
|
||||
|
||||
boolean sawMop = false;
|
||||
|
||||
final class LIBS_position {
|
||||
public LIBS_position(final SAMRecord read) {
|
||||
this.read = read;
|
||||
numOperators = read.getCigar().numCigarElements();
|
||||
}
|
||||
|
||||
SAMRecord read;
|
||||
public int getCurrentReadOffset() {
|
||||
return Math.max(0, currentReadOffset - 1);
|
||||
}
|
||||
|
||||
final int numOperators;
|
||||
int currentOperatorIndex = 0;
|
||||
int currentPositionOnOperator = 0;
|
||||
int currentReadOffset = 0;
|
||||
|
||||
boolean isBeforeDeletionStart = false;
|
||||
boolean isBeforeDeletedBase = false;
|
||||
boolean isAfterDeletionEnd = false;
|
||||
boolean isAfterDeletedBase = false;
|
||||
boolean isBeforeInsertion = false;
|
||||
boolean isAfterInsertion = false;
|
||||
boolean isNextToSoftClip = false;
|
||||
|
||||
boolean sawMop = false;
|
||||
|
||||
public LIBS_position(final SAMRecord read) {
|
||||
this.read = read;
|
||||
numOperators = read.getCigar().numCigarElements();
|
||||
}
|
||||
|
||||
public int getCurrentReadOffset() {
|
||||
return Math.max(0, currentReadOffset - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
|
||||
*/
|
||||
public boolean stepForwardOnGenome() {
|
||||
if ( currentOperatorIndex == numOperators )
|
||||
return false;
|
||||
|
||||
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
if ( currentPositionOnOperator >= curElement.getLength() ) {
|
||||
if ( ++currentOperatorIndex == numOperators )
|
||||
/**
|
||||
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
|
||||
*/
|
||||
public boolean stepForwardOnGenome() {
|
||||
if ( currentOperatorIndex == numOperators )
|
||||
return false;
|
||||
|
||||
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
currentPositionOnOperator = 0;
|
||||
}
|
||||
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
if ( currentPositionOnOperator >= curElement.getLength() ) {
|
||||
if ( ++currentOperatorIndex == numOperators )
|
||||
return false;
|
||||
|
||||
switch ( curElement.getOperator() ) {
|
||||
case I: // insertion w.r.t. the reference
|
||||
if ( !sawMop )
|
||||
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
|
||||
currentPositionOnOperator = 0;
|
||||
}
|
||||
|
||||
switch ( curElement.getOperator() ) {
|
||||
case I: // insertion w.r.t. the reference
|
||||
if ( !sawMop )
|
||||
break;
|
||||
case S: // soft clip
|
||||
currentReadOffset += curElement.getLength();
|
||||
case H: // hard clip
|
||||
case P: // padding
|
||||
currentOperatorIndex++;
|
||||
return stepForwardOnGenome();
|
||||
|
||||
case D: // deletion w.r.t. the reference
|
||||
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
case S: // soft clip
|
||||
currentReadOffset += curElement.getLength();
|
||||
case H: // hard clip
|
||||
case P: // padding
|
||||
currentOperatorIndex++;
|
||||
return stepForwardOnGenome();
|
||||
|
||||
case D: // deletion w.r.t. the reference
|
||||
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
sawMop = true;
|
||||
currentReadOffset++;
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
|
||||
}
|
||||
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
sawMop = true;
|
||||
currentReadOffset++;
|
||||
currentPositionOnOperator++;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
|
||||
final boolean isFirstOp = currentOperatorIndex == 0;
|
||||
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
|
||||
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
|
||||
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
|
||||
|
||||
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
|
||||
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
|
||||
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|
||||
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
|
||||
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
|
||||
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|
||||
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean isFirstOp = currentOperatorIndex == 0;
|
||||
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
|
||||
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
|
||||
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
|
||||
private static boolean isBeforeOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isLastOp,
|
||||
final boolean isLastBaseOfOp) {
|
||||
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
|
||||
}
|
||||
|
||||
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
|
||||
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
|
||||
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
|
||||
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|
||||
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
|
||||
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
|
||||
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|
||||
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean isBeforeOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isLastOp,
|
||||
final boolean isLastBaseOfOp) {
|
||||
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
|
||||
}
|
||||
|
||||
private static boolean isAfterOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isFirstOp,
|
||||
final boolean isFirstBaseOfOp) {
|
||||
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
|
||||
private static boolean isAfterOp(final Cigar cigar,
|
||||
final int currentOperatorIndex,
|
||||
final CigarOperator op,
|
||||
final boolean isFirstOp,
|
||||
final boolean isFirstBaseOfOp) {
|
||||
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.LegacyReadShardBalancer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
|
|
@ -114,7 +114,7 @@ public class TraverseReadsUnitTest extends BaseTest {
|
|||
@Test
|
||||
public void testUnmappedReadCount() {
|
||||
SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser);
|
||||
Iterable<Shard> shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
|
||||
Iterable<Shard> shardStrategy = dataSource.createShardIteratorOverAllReads(new LegacyReadShardBalancer());
|
||||
|
||||
countReadWalker.initialize();
|
||||
Object accumulator = countReadWalker.reduceInit();
|
||||
|
|
|
|||
|
|
@ -23,14 +23,14 @@ public class LegacyReservoirDownsamplerUnitTest {
|
|||
|
||||
@Test
|
||||
public void testEmptyIterator() {
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(1);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(1);
|
||||
Assert.assertTrue(downsampler.isEmpty(),"Downsampler is not empty but should be.");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOneElementWithPoolSizeOne() {
|
||||
List<GATKSAMRecord> reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76));
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(1);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(1);
|
||||
downsampler.addAll(reads);
|
||||
|
||||
Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");
|
||||
|
|
@ -42,7 +42,7 @@ public class LegacyReservoirDownsamplerUnitTest {
|
|||
@Test
|
||||
public void testOneElementWithPoolSizeGreaterThanOne() {
|
||||
List<GATKSAMRecord> reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76));
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(5);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(5);
|
||||
downsampler.addAll(reads);
|
||||
|
||||
Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");
|
||||
|
|
@ -58,7 +58,7 @@ public class LegacyReservoirDownsamplerUnitTest {
|
|||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76));
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(5);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(5);
|
||||
downsampler.addAll(reads);
|
||||
|
||||
Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");
|
||||
|
|
@ -78,7 +78,7 @@ public class LegacyReservoirDownsamplerUnitTest {
|
|||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76));
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(5);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(5);
|
||||
downsampler.addAll(reads);
|
||||
|
||||
Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");
|
||||
|
|
@ -99,7 +99,7 @@ public class LegacyReservoirDownsamplerUnitTest {
|
|||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76));
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(0);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(0);
|
||||
downsampler.addAll(reads);
|
||||
|
||||
Assert.assertTrue(downsampler.isEmpty(),"Downsampler isn't empty but should be");
|
||||
|
|
@ -115,7 +115,7 @@ public class LegacyReservoirDownsamplerUnitTest {
|
|||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76));
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(1);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(1);
|
||||
downsampler.addAll(reads);
|
||||
|
||||
Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");
|
||||
|
|
@ -128,7 +128,7 @@ public class LegacyReservoirDownsamplerUnitTest {
|
|||
public void testFillingAcrossLoci() {
|
||||
List<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76));
|
||||
ReservoirDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(5);
|
||||
LegacyReservoirDownsampler<SAMRecord> downsampler = new LegacyReservoirDownsampler<SAMRecord>(5);
|
||||
downsampler.addAll(reads);
|
||||
|
||||
Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");
|
||||
|
|
|
|||
Loading…
Reference in New Issue