From c7f0ca8ac53e320d2762da917158be51a9b2d8ae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 14:36:25 -0500 Subject: [PATCH] Optimization for LIBS: PerSampleReadStateManager now uses a simple LinkedList of AlignmentStateMachine -- Instead of storing a list of list of alignment starts, which is expensive to manipulate, we instead store a linear list of alignment starts. Not grouped as previously. This enables us to simplify iteration and update operations, making them much faster -- Critically, the downsampler still requires this list of list. We convert back and forth between these two representations as required, which is very rarely for normal data sets (WGS NA12878 on chr20 is 0.2%, 4x WGS is even less). --- .../PerSampleReadStateManager.java | 170 ++++++++++++------ .../utils/locusiterator/ReadStateManager.java | 2 +- 2 files changed, 115 insertions(+), 57 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java index c2a47bbdb..3f3bc706f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; @@ -43,18 +44,42 @@ import java.util.List; * Date: 1/13/13 * Time: 12:28 PM */ +@Invariant({ + "readStartsAreWellOrdered()", + "! isDownsampling() || downsamplingTarget > 0", + "nSites >= 0", + "nSitesNeedingDownsampling >= 0", + "nSitesNeedingDownsampling <= nSites" +}) final class PerSampleReadStateManager implements Iterable { private final static Logger logger = Logger.getLogger(ReadStateManager.class); - private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; + private final static boolean CAPTURE_DOWNSAMPLING_STATS = false; + + /** + * A list (potentially empty) of alignment state machines. + * + * The state machines must be ordered by the alignment start of their underlying reads, with the + * lowest alignment starts on the left, and the largest on the right + */ + private LinkedList readStatesByAlignmentStart = new LinkedList(); - private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; - private final int downsamplingTarget; + + /** + * The number of sites where downsampling has been invoked + */ private int nSitesNeedingDownsampling = 0; + + /** + * The number of sites we've visited + */ private int nSites = 0; + /** + * Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the downsampling params we want to use + */ public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() @@ -62,55 +87,118 @@ final class PerSampleReadStateManager implements Iterable : null; } + /** + * Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines, + * where each list contains machines with a unique genome site. The outer list is ordered + * by alignment start. + * + * For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then + * the resulting grouping will be [[10, 10], [11], [12, 12], [13]]. + * + * @return a non-null list of lists + */ + @Ensures("result != null") + private List> groupByAlignmentStart() { + final LinkedList> grouped = new LinkedList>(); + + AlignmentStateMachine last = null; + for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) { + if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) { + // we've advanced to a place where the state machine has a different state, + // so start a new list + grouped.add(new LinkedList()); + last = stateMachine; + } + grouped.getLast().add(stateMachine); + } + + return grouped; + } + + /** + * Flattens the grouped list of list of alignment state machines into a single list in order + * @return a non-null list contains the state machines + */ + @Ensures("result != null") + private LinkedList flattenByAlignmentStart(final List> grouped) { + final LinkedList flat = new LinkedList(); + for ( final List l : grouped ) + flat.addAll(l); + return flat; + } + + /** + * Test that the reads are ordered by their alignment starts + * @return true if well ordered, false otherwise + */ + private boolean readStartsAreWellOrdered() { + int lastStart = -1; + for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) { + if ( lastStart > machine.getRead().getAlignmentStart() ) + return false; + lastStart = machine.getRead().getAlignmentStart(); + } + return true; + } + /** * Assumes it can just keep the states linked lists without making a copy * @param states the new states to add to this manager - * @return The change in the number of states, after including states and potentially downsampling + * @return The change in the number of states, after including states and potentially downsampling. Note + * that this return result might be negative, if downsampling is enabled, as we might drop + * more sites than have been added by the downsampler */ @Requires("states != null") - @Ensures("result >= 0") - public int addStatesAtNextAlignmentStart(LinkedList states) { + public int addStatesAtNextAlignmentStart(final LinkedList states) { if ( states.isEmpty() ) { return 0; } - readStatesByAlignmentStart.add(states); + readStatesByAlignmentStart.addAll(states); int nStatesAdded = states.size(); - if ( isDownsampling() ) { + if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) { + // only go into the downsampling branch if we are downsampling and the coverage > the target captureDownsamplingStats(); - levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.submit(groupByAlignmentStart()); levelingDownsampler.signalEndOfInput(); nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); levelingDownsampler.reset(); } - thisSampleReadStates += nStatesAdded; return nStatesAdded; } + /** + * Is downsampling enabled for this manager? + * @return true if we are downsampling, false otherwise + */ private boolean isDownsampling() { return levelingDownsampler != null; } - private AlignmentStateMachine getFirst() { - if (readStatesByAlignmentStart.isEmpty()) - return null; - else - return readStatesByAlignmentStart.get(0).getFirst(); + /** + * Get the leftmost alignment state machine, or null if the read states is empty + * @return a potentially null AlignmentStateMachine + */ + public AlignmentStateMachine getFirst() { + return isEmpty() ? null : readStatesByAlignmentStart.getFirst(); } + /** + * Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true + */ @Requires("isDownsampling()") private void captureDownsamplingStats() { if ( CAPTURE_DOWNSAMPLING_STATS ) { nSites++; final int loc = getFirst().getGenomePosition(); String message = "Pass through"; - final boolean downsampling = thisSampleReadStates > downsamplingTarget; + final boolean downsampling = size() > downsamplingTarget; if ( downsampling ) { nSitesNeedingDownsampling++; message = "Downsampling"; @@ -118,7 +206,7 @@ final class PerSampleReadStateManager implements Iterable if ( downsampling || nSites % 10000 == 0 ) logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", - message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); } } @@ -130,17 +218,13 @@ final class PerSampleReadStateManager implements Iterable return readStatesByAlignmentStart.isEmpty(); } - public AlignmentStateMachine peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); - } - /** * Get the number of read states currently in this manager * @return the number of read states */ @Ensures("result >= 0") public int size() { - return thisSampleReadStates; + return readStatesByAlignmentStart.size(); } /** @@ -166,38 +250,12 @@ final class PerSampleReadStateManager implements Iterable return nRemoved; } - // todo -- reimplement + /** + * Iterate over the AlignmentStateMachine in this manager in alignment start order. + * @return a valid iterator + */ + @Ensures("result != null") public Iterator iterator() { - return new Iterator() { - private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates; - private Iterator currentPositionReadStatesIterator; - - @Override - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - @Override - public AlignmentStateMachine next() { - if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - @Override - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; + return readStatesByAlignmentStart.iterator(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 4011875a6..09ec3b264 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -122,7 +122,7 @@ final class ReadStateManager implements Iterable