From 017ab6b690727535550018f3080e8c34e63e9264 Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 19 May 2010 05:40:05 +0000 Subject: [PATCH] Experimental versions of downsampler and Ryan's deduper are now available either as walker attributes or from the command-line. Not ready yet! Downsampling/deduping works in a general sense, but this approach has not been completely optimized or validated. Use with caution. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3392 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/DownsampleType.java | 15 +++ .../sting/gatk/DownsamplingMethod.java | 47 ++++++++ .../sting/gatk/GenomeAnalysisEngine.java | 12 +- .../org/broadinstitute/sting/gatk/Reads.java | 26 ++-- .../sting/gatk/WalkerManager.java | 21 ++++ .../arguments/GATKArgumentCollection.java | 11 +- .../gatk/datasources/providers/LocusView.java | 4 +- .../BlockDrivenSAMDataSource.java | 4 +- .../IndexDrivenSAMDataSource.java | 6 +- .../sting/gatk/executive/WindowMaker.java | 18 +-- .../DownsamplingLocusIteratorByState.java | 111 ++++++++---------- .../sting/gatk/walkers/Downsample.java | 22 ++++ .../sting/utils/ReservoirDownsampler.java | 2 +- .../GATKArgumentCollectionUnitTest.java | 1 - 14 files changed, 194 insertions(+), 106 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/DownsampleType.java create mode 100644 java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java create mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java diff --git a/java/src/org/broadinstitute/sting/gatk/DownsampleType.java b/java/src/org/broadinstitute/sting/gatk/DownsampleType.java new file mode 100644 index 000000000..552d7b6cb --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/DownsampleType.java @@ -0,0 +1,15 @@ +package org.broadinstitute.sting.gatk; + +/** + * Type of downsampling method to invoke. 
+ * + * @author hanna + * @version 0.1 + */ + +public enum DownsampleType { + NONE, + ALL_READS, + EXPERIMENTAL_BY_SAMPLE, + EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR +} diff --git a/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java b/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java new file mode 100644 index 000000000..1d132f03c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java @@ -0,0 +1,47 @@ +package org.broadinstitute.sting.gatk; + +import org.broadinstitute.sting.utils.StingException; + +/** + * Describes the method for downsampling reads at a given locus. + * + * @author hanna + * @version 0.1 + */ + +public class DownsamplingMethod { + /** + * Type of downsampling to perform. + */ + public final DownsampleType type; + + /** + * Actual downsampling target is specified as an integer number of reads. + */ + public final Integer toCoverage; + + /** + * Actual downsampling target is specified as a fraction of total available reads. + */ + public final Double toFraction; + + public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) { + // Do some basic sanity checks on the downsampling parameters passed in. + + // Can't leave both toFraction and toCoverage null unless type is NONE or the experimental naive duplicate eliminator. + if(type != DownsampleType.NONE && type != DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR && toFraction == null && toCoverage == null) + throw new StingException("Must specify either toFraction or toCoverage when downsampling."); + + // Fraction and coverage cannot both be specified. + if(toFraction != null && toCoverage != null) + throw new StingException("Downsampling coverage and fraction are both specified. Please choose only one."); + + // Experimental by sample downsampling does not work with a fraction of reads. 
+ if(type == DownsampleType.EXPERIMENTAL_BY_SAMPLE && toFraction != null) + throw new StingException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method"); + + this.type = type; + this.toCoverage = toCoverage; + this.toFraction = toFraction; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 0747f87cb..2b0a7e2db 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -524,10 +524,18 @@ public class GenomeAnalysisEngine { * @return The reads object providing reads source info. */ private Reads extractSourceInfo(Walker walker, Collection filters, GATKArgumentCollection argCollection) { + + DownsamplingMethod method = null; + if(argCollection.downsamplingType != DownsampleType.NONE) + method = new DownsamplingMethod(argCollection.downsamplingType,argCollection.downsampleCoverage,argCollection.downsampleFraction); + else if(WalkerManager.getDownsamplingMethod(walker) != null) + method = WalkerManager.getDownsamplingMethod(walker); + else + method = new DownsamplingMethod(DownsampleType.NONE,null,null); + return new Reads(argCollection.samFiles, argCollection.strictnessLevel, - argCollection.downsampleFraction, - argCollection.downsampleCoverage, + method, new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, argCollection.readMaxPileup, diff --git a/java/src/org/broadinstitute/sting/gatk/Reads.java b/java/src/org/broadinstitute/sting/gatk/Reads.java index c1b30a860..0135cfb20 100755 --- a/java/src/org/broadinstitute/sting/gatk/Reads.java +++ b/java/src/org/broadinstitute/sting/gatk/Reads.java @@ -28,8 +28,7 @@ import java.util.Collection; public class Reads { private List readsFiles = null; private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT; - private Double downsamplingFraction 
= null; - private Integer downsampleToCoverage = null; + private DownsamplingMethod downsamplingMethod = null; private ValidationExclusion exclusionList = null; private Collection supplementalFilters = null; protected int maximumReadsAtLocus = Integer.MAX_VALUE; // this should always be set, so we'll default it MAX_INT @@ -76,19 +75,11 @@ public class Reads { } /** - * Get the fraction of reads to downsample. + * Gets the method and parameters used when downsampling reads. - * @return Downsample fraction. + * @return The downsampling method and its parameters. */ - public Double getDownsamplingFraction() { - return downsamplingFraction; - } - - /** - * Downsample each locus to the specified coverage. - * @return Coverage to which to downsample. - */ - public Integer getDownsampleToCoverage() { - return downsampleToCoverage; + public DownsamplingMethod getDownsamplingMethod() { + return downsamplingMethod; } /** @@ -117,6 +108,7 @@ public class Reads { */ public Reads( List readsFiles ) { this.readsFiles = readsFiles; + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.NONE,null,null); this.supplementalFilters = new ArrayList(); this.exclusionList = new ValidationExclusion(); } @@ -127,8 +119,6 @@ public class Reads { * is package protected. * @param samFiles list of reads files. * @param strictness Stringency of reads file parsing. - * @param downsampleFraction fraction of reads to downsample. - * @param downsampleCoverage downsampling per-locus. * @param exclusionList what safety checks we're willing to let slide * @param supplementalFilters additional filters to dynamically apply. 
* @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with @@ -140,8 +130,7 @@ public class Reads { */ Reads( List samFiles, SAMFileReader.ValidationStringency strictness, - Double downsampleFraction, - Integer downsampleCoverage, + DownsamplingMethod downsamplingMethod, ValidationExclusion exclusionList, Collection supplementalFilters, int maximumReadsAtLocus, @@ -149,8 +138,7 @@ public class Reads { boolean generateExtendedEvents) { this.readsFiles = samFiles; this.validationStringency = strictness; - this.downsamplingFraction = downsampleFraction; - this.downsampleToCoverage = downsampleCoverage; + this.downsamplingMethod = downsamplingMethod; this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; this.supplementalFilters = supplementalFilters; this.maximumReadsAtLocus = maximumReadsAtLocus; diff --git a/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index ab2253933..5c0f76f3a 100755 --- a/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -238,6 +238,27 @@ public class WalkerManager extends PluginManager { return filters; } + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walker The walker to interrogate. + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod(Walker walker) { + DownsamplingMethod downsamplingMethod = null; + + if( walker.getClass().isAnnotationPresent(Downsample.class) ) { + Downsample downsampleParameters = walker.getClass().getAnnotation(Downsample.class); + DownsampleType type = downsampleParameters.by(); + Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? 
downsampleParameters.toCoverage() : null; + Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction); + } + + return downsamplingMethod; + } + /** * Create a name for this type of walker. * diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index beb5dc49d..ac506fa4a 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMFileReader; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.DownsampleType; import org.simpleframework.xml.*; import org.simpleframework.xml.core.Persister; import org.simpleframework.xml.stream.Format; @@ -124,6 +125,10 @@ public class GATKArgumentCollection { @Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level. Vastly improves performance at areas with abnormal depth due to mapping Q0 reads", required = false) public Boolean filterZeroMappingQualityReads = false; + @Element(required = false) + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. 
Reads will be selected randomly to be removed from the pile based on the method described here.", required = false) + public DownsampleType downsamplingType = DownsampleType.NONE; + @Element(required = false) @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; @@ -148,13 +153,9 @@ public class GATKArgumentCollection { @Argument(fullName = "max_reads_at_locus", shortName = "mrl", doc = "Sets the upper limit for the number of reads presented at a single locus; use this argument if you are running into memory issues resulting from too many reads piled up at a given locus (but use downsample_to_coverage instead if you are trying to downsample); int.MAX_VALUE by default.", required = false) public int readMaxPileup = Integer.MAX_VALUE; - @Element(required = false) - @Argument(fullName = "disablethreading", shortName = "dt", doc = "Disable experimental threading support.", required = false) - public Boolean disableThreading = false; - /** How many threads should be allocated to this analysis. 
*/ @Element(required = false) - @Argument(fullName = "numthreads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) public int numberOfThreads = 1; /** What rule should we use when merging intervals */ diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index faa3ca318..eaf341fc3 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -127,8 +127,8 @@ public abstract class LocusView extends LocusIterator implements View { // Find the next. seedNextLocus(); - if( sourceInfo.getDownsampleToCoverage() != null ) - current.downsampleToCoverage( sourceInfo.getDownsampleToCoverage() ); + if( sourceInfo.getDownsamplingMethod().toCoverage != null ) + current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); // if the current loci isn't null, get the overflow tracker and pass it to the alignment context if ((this.loci != null)) diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index fe1c4ff37..3d116e25a 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -261,7 +261,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return applyDecoratingIterators(enableVerification, new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(reads,mergingIterator)), - reads.getDownsamplingFraction(), 
+ reads.getDownsamplingMethod().toFraction, reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getSupplementalFilters()); } @@ -282,7 +282,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return applyDecoratingIterators(shard instanceof ReadShard, new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(reads,mergingIterator)), - reads.getDownsamplingFraction(), + reads.getDownsamplingMethod().toFraction, reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getSupplementalFilters()); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java index 295b9d849..e34f64cdc 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java @@ -165,14 +165,14 @@ public class IndexDrivenSAMDataSource extends SAMDataSource { iterator = seekRead(shard); iterator = applyDecoratingIterators(true, iterator, - reads.getDownsamplingFraction(), + reads.getDownsamplingMethod().toFraction, reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getSupplementalFilters()); } else if (shard.getShardType() == Shard.ShardType.LOCUS) { iterator = seekLocus(shard); iterator = applyDecoratingIterators(false, iterator, - reads.getDownsamplingFraction(), + reads.getDownsamplingMethod().toFraction, reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getSupplementalFilters()); } else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) || @@ -180,7 +180,7 @@ public class IndexDrivenSAMDataSource extends SAMDataSource { iterator = seekLocus(shard); iterator = 
applyDecoratingIterators(false, iterator, - reads.getDownsamplingFraction(), + reads.getDownsamplingMethod().toFraction, reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getSupplementalFilters()); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index a6945e2b3..d40078803 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -1,11 +1,9 @@ package org.broadinstitute.sting.gatk.executive; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.LocusIterator; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.iterators.LocusOverflowTracker; +import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -57,15 +55,21 @@ public class WindowMaker implements Iterable, I /** * Create a new window maker with the given iterator as a data source, covering - * the given inteervals. + * the given intervals. * @param iterator The data source for this window. * @param intervals The set of intervals over which to traverse. 
*/ public WindowMaker(StingSAMIterator iterator, List intervals) { this.sourceInfo = iterator.getSourceInfo(); this.readIterator = iterator; - - LocusIterator locusIterator = new LocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); + + LocusIterator locusIterator; + if(sourceInfo.getDownsamplingMethod() != null && + (sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_BY_SAMPLE || sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR)) + locusIterator = new DownsamplingLocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); + else + locusIterator = new LocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); + this.locusOverflowTracker = locusIterator.getLocusOverflowTracker(); this.sourceIterator = new PeekableIterator(locusIterator); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java index 7dd44b7f4..76e9e6347 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java @@ -30,6 +30,8 @@ import net.sf.picard.util.PeekableIterator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -259,7 +261,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { // TODO: Push in header via constructor if(GenomeAnalysisEngine.instance.getDataSource() != null) 
sampleNames.addAll(SampleUtils.getSAMFileSamples(GenomeAnalysisEngine.instance.getSAMFileHeader())); - readStates = new ReadStateManager(samIterator,sampleNames,readInformation.getMaxReadsAtLocus()); + readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod(),sampleNames); this.readInfo = readInformation; } @@ -507,8 +509,10 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { private class ReadStateManager implements Iterable { private final PeekableIterator iterator; + private final DownsamplingMethod downsamplingMethod; + private final Map> downsamplersBySampleName = new HashMap>(); - private final int maxReadsPerSample; + private final int targetCoverage; private final Deque>> readStatesByAlignmentStart; @@ -519,11 +523,16 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { */ private Random downsampleRandomizer = new Random(38148309L); - public ReadStateManager(Iterator source, Collection sampleNames, int maxReadsPerSample) { + public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod, Collection sampleNames) { this.iterator = new PeekableIterator(source); - this.maxReadsPerSample = maxReadsPerSample; - for(String sampleName: sampleNames) - downsamplersBySampleName.put(sampleName,new ReservoirDownsampler(maxReadsPerSample)); + this.downsamplingMethod = downsamplingMethod; + this.targetCoverage = downsamplingMethod.toCoverage != null ? 
downsamplingMethod.toCoverage : 1; + if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) + downsamplersBySampleName.put(null,new ReservoirDownsampler(targetCoverage)); + else { + for(String sampleName: sampleNames) + downsamplersBySampleName.put(sampleName,new ReservoirDownsampler(targetCoverage)); + } this.readStatesByAlignmentStart = new LinkedList>>(); } @@ -606,9 +615,23 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { } public void collectPendingReads() { - while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - SAMRecord read = iterator.next(); - downsamplersBySampleName.get(read.getReadGroup().getSample()).add(read); + if(iterator.hasNext() && readStates.size() == 0) { + int firstContigIndex = iterator.peek().getReferenceIndex(); + int firstAlignmentStart = iterator.peek().getAlignmentStart(); + while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { + SAMRecord read = iterator.next(); + getDownsampler(read.getReadGroup().getSample()).add(read); + } + } + else { + // Fast fail in the case that the read is past the current position. 
+ if(iterator.hasNext() && readIsPastCurrentPosition(iterator.peek())) + return; + + while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { + SAMRecord read = iterator.next(); + getDownsampler(read.getReadGroup().getSample()).add(read); + } } Map> culledReadStatesBySample = new HashMap>(); @@ -621,14 +644,14 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { downsampler.clear(); int readsInHanger = countReadsInHanger(sampleName); - if(readsInHanger+newReads.size() <= maxReadsPerSample) + if(readsInHanger+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) addReadsToHanger(culledReadStatesBySample,sampleName,newReads,newReads.size()); else { Iterator>> backIterator = readStatesByAlignmentStart.descendingIterator(); boolean readPruned = true; - while(readsInHanger+newReads.size()>maxReadsPerSample && readPruned) { + while(readsInHanger+newReads.size()>targetCoverage && readPruned) { readPruned = false; - while(readsInHanger+newReads.size()>maxReadsPerSample && backIterator.hasNext()) { + while(readsInHanger+newReads.size()>targetCoverage && backIterator.hasNext()) { List readsAtLocus = backIterator.next().get(sampleName); if(readsAtLocus.size() > 1) { readsAtLocus.remove(downsampleRandomizer.nextInt(readsAtLocus.size())); @@ -638,65 +661,24 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { } } - if(readsInHanger == maxReadsPerSample) { + if(readsInHanger == targetCoverage) { Collection firstHangerForSample = readStatesByAlignmentStart.getFirst().get(sampleName); readsInHanger -= firstHangerForSample.size(); firstHangerForSample.clear(); } - addReadsToHanger(culledReadStatesBySample,sampleName,newReads,maxReadsPerSample-readsInHanger); + addReadsToHanger(culledReadStatesBySample,sampleName,newReads,targetCoverage-readsInHanger); } readStatesByAlignmentStart.add(culledReadStatesBySample); } + } -/* else { - if() { - // Consume the collection 
of reads. - downsamplingIterator.next(); - - Map> newReadsBySample = new HashMap>(); - Map> culledReadStatesBySample = new HashMap>(); - - for(String sampleName: sampleNames) - newReadsBySample.put(sampleName,getReadsForGivenSample(reads,sampleName)); - - for(String sampleName: newReadsBySample.keySet()) { - Collection newReads = newReadsBySample.get(sampleName); - int readsInHanger = countReadsInHanger(sampleName); - - //if(readsInHanger+newReads.size() <= maxReadsPerSample) - addReadsToHanger(culledReadStatesBySample,sampleName,newReads,newReads.size()); - Iterator>> backIterator = readStatesByAlignmentStart.descendingIterator(); - boolean readPruned = true; - while(readsInHanger+newReads.size()>maxReadsPerSample && readPruned) { - readPruned = false; - while(readsInHanger+newReads.size()>maxReadsPerSample && backIterator.hasNext()) { - List readsAtLocus = backIterator.next().get(sampleName); - if(readsAtLocus.size() > 1) { - readsAtLocus.remove(downsampleRandomizer.nextInt(readsAtLocus.size())); - readPruned = true; - readsInHanger--; - } - } - } - - if(readsInHanger == maxReadsPerSample) { - Collection firstHangerForSample = readStatesByAlignmentStart.getFirst().get(sampleName); - readsInHanger -= firstHangerForSample.size(); - firstHangerForSample.clear(); - } - - addReadsToHanger(culledReadStatesBySample,sampleName,newReads,maxReadsPerSample-readsInHanger); - } - } - - readStatesByAlignmentStart.add(culledReadStatesBySample); - } - else if(readIsPastCurrentPosition(reads.iterator().next())) - break; - } -*/ + private ReservoirDownsampler getDownsampler(String sampleName) { + if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) + return downsamplersBySampleName.get(null); + else + return downsamplersBySampleName.get(sampleName); } private int countReadsInHanger() { @@ -737,9 +719,10 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { Iterator>> hangerIterator = readStatesByAlignmentStart.iterator(); 
while(hangerIterator.hasNext()) { Map> hangerEntry = hangerIterator.next(); - for(String sampleName: sampleNames) { - if(hangerEntry.containsKey(sampleName) && hangerEntry.get(sampleName).size() == 0) - hangerEntry.remove(sampleName); + Iterator> entryBySampleIterator = hangerEntry.values().iterator(); + while(entryBySampleIterator.hasNext()) { + if(entryBySampleIterator.next().size() == 0) + entryBySampleIterator.remove(); } if(hangerEntry.size() == 0) hangerIterator.remove(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java b/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java new file mode 100644 index 000000000..d662b0092 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java @@ -0,0 +1,22 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.DownsampleType; + +import java.lang.annotation.*; + +/** + * Specifies a method for downsampling the reads passed to a given + * walker based on the input from that walker. + * + * @author hanna + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface Downsample { + DownsampleType by(); + int toCoverage() default -1; + double toFraction() default -1.0F; +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java index ea4b848c0..51a1775bf 100644 --- a/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java +++ b/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java @@ -73,7 +73,7 @@ public class ReservoirDownsampler implements Collection { * @return The downsampled contents of this reservoir. 
*/ public Collection getDownsampledContents() { - return Collections.unmodifiableCollection(reservoir); + return (Collection)reservoir.clone(); } @Override diff --git a/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java index df76b22e0..f98175004 100755 --- a/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java @@ -92,7 +92,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest { collect.intervals = new ArrayList(); collect.intervals.add("intervals".toLowerCase()); collect.excludeIntervals = new ArrayList(); - collect.disableThreading = false; collect.outFileName = "outFileName".toLowerCase(); collect.errFileName = "errFileName".toLowerCase(); collect.outErrFileName = "outErrFileName".toLowerCase();