diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java index c2a47bbdb..3f3bc706f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; @@ -43,18 +44,42 @@ import java.util.List; * Date: 1/13/13 * Time: 12:28 PM */ +@Invariant({ + "readStartsAreWellOrdered()", + "! isDownsampling() || downsamplingTarget > 0", + "nSites >= 0", + "nSitesNeedingDownsampling >= 0", + "nSitesNeedingDownsampling <= nSites" +}) final class PerSampleReadStateManager implements Iterable { private final static Logger logger = Logger.getLogger(ReadStateManager.class); - private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; + private final static boolean CAPTURE_DOWNSAMPLING_STATS = false; + + /** + * A list (potentially empty) of alignment state machines. + * + * The state machines must be ordered by the alignment start of their underlying reads, with the + * lowest alignment starts on the left, and the largest on the right + */ + private LinkedList readStatesByAlignmentStart = new LinkedList(); - private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; - private final int downsamplingTarget; + + /** + * The number of sites where downsampling has been invoked + */ private int nSitesNeedingDownsampling = 0; + + /** + * The number of sites we've visited + */ private int nSites = 0; + /** + * Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the downsampling params we want to use + */ public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() @@ -62,55 +87,118 @@ final class PerSampleReadStateManager implements Iterable : null; } + /** + * Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines, + * where each list contains machines with a unique genome site. The outer list is ordered + * by alignment start. + * + * For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then + * the resulting grouping will be [[10, 10], [11], [12, 12], [13]]. + * + * @return a non-null list of lists + */ + @Ensures("result != null") + private List> groupByAlignmentStart() { + final LinkedList> grouped = new LinkedList>(); + + AlignmentStateMachine last = null; + for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) { + if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) { + // we've advanced to a place where the state machine has a different state, + // so start a new list + grouped.add(new LinkedList()); + last = stateMachine; + } + grouped.getLast().add(stateMachine); + } + + return grouped; + } + + /** + * Flattens the grouped list of list of alignment state machines into a single list in order + * @return a non-null list contains the state machines + */ + @Ensures("result != null") + private LinkedList flattenByAlignmentStart(final List> grouped) { + final LinkedList flat = new LinkedList(); + for ( final List l : grouped ) + flat.addAll(l); + return flat; + } + + /** + * Test that the reads are ordered by their alignment starts + * @return true if well ordered, false otherwise + */ + private boolean readStartsAreWellOrdered() { + int lastStart = -1; + for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) { + if ( lastStart > machine.getRead().getAlignmentStart() ) + return false; + lastStart = machine.getRead().getAlignmentStart(); + } + return true; + } + /** * Assumes it can just keep the states linked lists without making a copy * @param states the new states to add to this manager - * @return The change in the number of states, after including states and potentially downsampling + * @return The change in the number of states, after including states and potentially downsampling. Note + * that this return result might be negative, if downsampling is enabled, as we might drop + * more sites than have been added by the downsampler */ @Requires("states != null") - @Ensures("result >= 0") - public int addStatesAtNextAlignmentStart(LinkedList states) { + public int addStatesAtNextAlignmentStart(final LinkedList states) { if ( states.isEmpty() ) { return 0; } - readStatesByAlignmentStart.add(states); + readStatesByAlignmentStart.addAll(states); int nStatesAdded = states.size(); - if ( isDownsampling() ) { + if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) { + // only go into the downsampling branch if we are downsampling and the coverage > the target captureDownsamplingStats(); - levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.submit(groupByAlignmentStart()); levelingDownsampler.signalEndOfInput(); nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); levelingDownsampler.reset(); } - thisSampleReadStates += nStatesAdded; return nStatesAdded; } + /** + * Is downsampling enabled for this manager? + * @return true if we are downsampling, false otherwise + */ private boolean isDownsampling() { return levelingDownsampler != null; } - private AlignmentStateMachine getFirst() { - if (readStatesByAlignmentStart.isEmpty()) - return null; - else - return readStatesByAlignmentStart.get(0).getFirst(); + /** + * Get the leftmost alignment state machine, or null if the read states is empty + * @return a potentially null AlignmentStateMachine + */ + public AlignmentStateMachine getFirst() { + return isEmpty() ? null : readStatesByAlignmentStart.getFirst(); } + /** + * Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true + */ @Requires("isDownsampling()") private void captureDownsamplingStats() { if ( CAPTURE_DOWNSAMPLING_STATS ) { nSites++; final int loc = getFirst().getGenomePosition(); String message = "Pass through"; - final boolean downsampling = thisSampleReadStates > downsamplingTarget; + final boolean downsampling = size() > downsamplingTarget; if ( downsampling ) { nSitesNeedingDownsampling++; message = "Downsampling"; @@ -118,7 +206,7 @@ final class PerSampleReadStateManager implements Iterable if ( downsampling || nSites % 10000 == 0 ) logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", - message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); } } @@ -130,17 +218,13 @@ final class PerSampleReadStateManager implements Iterable return readStatesByAlignmentStart.isEmpty(); } - public AlignmentStateMachine peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); - } - /** * Get the number of read states currently in this manager * @return the number of read states */ @Ensures("result >= 0") public int size() { - return thisSampleReadStates; + return readStatesByAlignmentStart.size(); } /** @@ -166,38 +250,12 @@ final class PerSampleReadStateManager implements Iterable return nRemoved; } - // todo -- reimplement + /** + * Iterate over the AlignmentStateMachine in this manager in alignment start order. + * @return a valid iterator + */ + @Ensures("result != null") public Iterator iterator() { - return new Iterator() { - private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates; - private Iterator currentPositionReadStatesIterator; - - @Override - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - @Override - public AlignmentStateMachine next() { - if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - @Override - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; + return readStatesByAlignmentStart.iterator(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 4011875a6..09ec3b264 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -122,7 +122,7 @@ final class ReadStateManager implements Iterable