From a4334a67e088d9cd221dadc011edd1478dc7b28f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 Jan 2013 19:22:36 -0500 Subject: [PATCH] SamplePartitioner optimizations and bugfixes -- Use a linked hash map instead of a hash map since we want to iterate through the map fairly often -- Ensure that we call doneSubmittingReads before getting reads for samples. This function call fell out before and since it wasn't enforced I only noticed the problem while writing comments -- Don't make unnecessary calls to contains for map. Just use get() and check that the result is null -- Use a LinkedList in PassThroughDownsampler, since this is faster for add() than the existing ArrayList, and we were's using random access to any resulting --- .../downsampling/PassThroughDownsampler.java | 14 +- .../utils/locusiterator/ReadStateManager.java | 10 +- .../locusiterator/SamplePartitioner.java | 124 +++++++++++++++--- 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java index 600834012..b06d5f5b4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -27,8 +27,8 @@ package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMRecord; -import java.util.ArrayList; import java.util.Collection; +import java.util.LinkedList; import java.util.List; /** @@ -41,7 +41,7 @@ import java.util.List; */ public class PassThroughDownsampler implements ReadsDownsampler { - private ArrayList selectedReads; + private LinkedList selectedReads; public PassThroughDownsampler() { clear(); @@ -59,9 +59,13 @@ public class PassThroughDownsampler implements ReadsDownsam } public boolean hasFinalizedItems() { - return selectedReads.size() > 0; + return ! selectedReads.isEmpty(); } + /** + * Note that this list is a linked list and so doesn't support fast random access + * @return + */ public List consumeFinalizedItems() { // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; @@ -74,7 +78,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public T peekFinalized() { - return selectedReads.isEmpty() ? null : selectedReads.get(0); + return selectedReads.isEmpty() ? null : selectedReads.getFirst(); } public T peekPending() { @@ -90,7 +94,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public void clear() { - selectedReads = new ArrayList(); + selectedReads = new LinkedList(); } public void reset() { diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 955dbcef7..b5dbe2ddb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -193,13 +193,15 @@ final class ReadStateManager implements Iterable newReads = samplePartitioner.getReadsForSample(sample); -// // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample -// // and add to the list of reads. Note this may reorder the list of reads someone (it groups them -// // by sample, but it cannot change their absolute position on the genome as they all must -// // start at the current location + // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample + // and add to the list of reads. Note this may reorder the list of reads someone (it groups them + // by sample, but it cannot change their absolute position on the genome as they all must + // start at the current location if ( keepSubmittedReads ) submittedReads.addAll(newReads); diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java index 1653c6a92..7dada292b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.locusiterator; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.PassThroughDownsampler; @@ -33,49 +35,137 @@ import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import java.util.*; /** - * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * Divides reads by sample and (if requested) does a preliminary downsampling pass + * with a ReservoirDownsampler. * * Note: stores reads by sample ID string, not by sample object */ class SamplePartitioner { - private Map> readsBySample; + /** + * Map from sample name (as a string) to a downsampler of reads for that sample + */ + final private Map> readsBySample; + /** + * Are we in a state where we're done submitting reads and have semi-finalized the + * underlying per sample downsampler? + */ + boolean doneSubmittingReads = false; + + /** + * Create a new SamplePartitioner capable of splitting reads up into buckets of reads for + * each sample in samples, and perform a preliminary downsampling of these reads + * (separately for each sample) if downsampling is requested in LIBSDownsamplingInfo + * + * Note that samples must be comprehensive, in that all reads every submitted to this + * partitioner must come from one of the samples provided here. If not, submitRead + * will throw an exception. Duplicates in the list of samples will be ignored + * + * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? + * @param samples the complete list of samples we're going to partition reads into + */ + @Ensures({ + "readsBySample != null", + "! readsBySample.isEmpty()", + "readsBySample.size() == new HashSet(samples).size()" + }) public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { - readsBySample = new HashMap>(samples.size()); - for ( String sample : samples ) { + if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); + if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples); + + readsBySample = new LinkedHashMap>(samples.size()); + for ( final String sample : samples ) { readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo)); } } + /** + * Create a new, ready to use downsampler based on the parameters in LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the parameters to use in creating the downsampler + * @return a downsampler appropriate for LIBSDownsamplingInfo. If no downsampling is requested, + * uses the PassThroughDownsampler, which does nothing at all. + */ + @Requires("LIBSDownsamplingInfo != null") + @Ensures("result != null") private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { return LIBSDownsamplingInfo.isPerformDownsampling() ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage()) : new PassThroughDownsampler(); } - public void submitRead(T read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submit(read); + /** + * Offer this read to the partitioner, putting it into the bucket of reads for the sample + * of read (obtained via the read's read group). + * + * If the read group is missing, uses the special "null" read group + * + * @throws IllegalStateException if the sample of read wasn't present in the original + * set of samples provided to this SamplePartitioner at construction + * + * @param read the read to add to the sample's list of reads + */ + @Requires("read != null") + @Ensures("doneSubmittingReads == false") + public void submitRead(final T read) { + final String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) + throw new IllegalStateException("Offered read with sample name " + sampleName + " to SamplePartitioner " + + "but this sample wasn't provided as one of possible samples at construction"); + + downsampler.submit(read); + doneSubmittingReads = false; } + /** + * Tell this partitioner that all reads in this cycle have been submitted, so that we + * can finalize whatever downsampling is required by each sample. + * + * Note that we *must* call this function before getReadsForSample, or else that + * function will exception out. + */ + @Ensures("doneSubmittingReads == true") public void doneSubmittingReads() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().signalEndOfInput(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.signalEndOfInput(); } + doneSubmittingReads = true; } - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); + /** + * Get the final collection of reads for this sample for this cycle + * + * The cycle is defined as all of the reads that occur between + * the first call to submitRead until doneSubmittingReads is called. At that + * point additional downsampling may occur (depending on construction arguments) + * and that set of reads is returned here. + * + * Note that this function can only be called once per cycle, as underlying + * collection of reads is cleared. + * + * @param sampleName the sample we want reads for, must be present in the original samples + * @return a non-null collection of reads for sample in this cycle + */ + @Ensures("result != null") + public Collection getReadsForSample(final String sampleName) { + if ( ! doneSubmittingReads ) throw new IllegalStateException("getReadsForSample called before doneSubmittingReads was called"); - return readsBySample.get(sampleName).consumeFinalizedItems(); + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) throw new NoSuchElementException("Sample name not found"); + + return downsampler.consumeFinalizedItems(); } + /** + * Resets this SamplePartitioner, indicating that we're starting a new + * cycle of adding reads to each underlying downsampler. + */ + @Ensures("doneSubmittingReads == false") public void reset() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().clear(); - perSampleReads.getValue().reset(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.clear(); + downsampler.reset(); } + doneSubmittingReads = false; } }