Optimization for LIBS: PerSampleReadStateManager now uses a simple LinkedList of AlignmentStateMachine
-- Instead of storing a list of list of alignment starts, which is expensive to manipulate, we instead store a linear list of alignment starts. Not grouped as previously. This enables us to simplify iteration and update operations, making them much faster -- Critically, the downsampler still requires this list of list. We convert back and forth between these two representations as required, which is very rarely for normal data sets (WGS NA12878 on chr20 is 0.2%, 4x WGS is even less).
This commit is contained in:
parent
5a5422e4f8
commit
c7f0ca8ac5
|
|
@ -26,6 +26,7 @@
|
|||
package org.broadinstitute.sting.utils.locusiterator;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.log4j.Logger;
|
||||
|
|
@ -43,18 +44,42 @@ import java.util.List;
|
|||
* Date: 1/13/13
|
||||
* Time: 12:28 PM
|
||||
*/
|
||||
@Invariant({
|
||||
"readStartsAreWellOrdered()",
|
||||
"! isDownsampling() || downsamplingTarget > 0",
|
||||
"nSites >= 0",
|
||||
"nSitesNeedingDownsampling >= 0",
|
||||
"nSitesNeedingDownsampling <= nSites"
|
||||
})
|
||||
final class PerSampleReadStateManager implements Iterable<AlignmentStateMachine> {
|
||||
private final static Logger logger = Logger.getLogger(ReadStateManager.class);
|
||||
private final static boolean CAPTURE_DOWNSAMPLING_STATS = true;
|
||||
private final static boolean CAPTURE_DOWNSAMPLING_STATS = false;
|
||||
|
||||
/**
|
||||
* A list (potentially empty) of alignment state machines.
|
||||
*
|
||||
* The state machines must be ordered by the alignment start of their underlying reads, with the
|
||||
* lowest alignment starts on the left, and the largest on the right
|
||||
*/
|
||||
private LinkedList<AlignmentStateMachine> readStatesByAlignmentStart = new LinkedList<AlignmentStateMachine>();
|
||||
|
||||
private List<LinkedList<AlignmentStateMachine>> readStatesByAlignmentStart = new LinkedList<LinkedList<AlignmentStateMachine>>();
|
||||
private final Downsampler<LinkedList<AlignmentStateMachine>> levelingDownsampler;
|
||||
private int thisSampleReadStates = 0;
|
||||
|
||||
private final int downsamplingTarget;
|
||||
|
||||
/**
|
||||
* The number of sites where downsampling has been invoked
|
||||
*/
|
||||
private int nSitesNeedingDownsampling = 0;
|
||||
|
||||
/**
|
||||
* The number of sites we've visited
|
||||
*/
|
||||
private int nSites = 0;
|
||||
|
||||
/**
|
||||
* Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo
|
||||
* @param LIBSDownsamplingInfo the downsampling params we want to use
|
||||
*/
|
||||
public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
|
||||
this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1;
|
||||
this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling()
|
||||
|
|
@ -62,55 +87,118 @@ final class PerSampleReadStateManager implements Iterable<AlignmentStateMachine>
|
|||
: null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines,
|
||||
* where each list contains machines with a unique genome site. The outer list is ordered
|
||||
* by alignment start.
|
||||
*
|
||||
* For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then
|
||||
* the resulting grouping will be [[10, 10], [11], [12, 12], [13]].
|
||||
*
|
||||
* @return a non-null list of lists
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
private List<LinkedList<AlignmentStateMachine>> groupByAlignmentStart() {
|
||||
final LinkedList<LinkedList<AlignmentStateMachine>> grouped = new LinkedList<LinkedList<AlignmentStateMachine>>();
|
||||
|
||||
AlignmentStateMachine last = null;
|
||||
for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) {
|
||||
if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) {
|
||||
// we've advanced to a place where the state machine has a different state,
|
||||
// so start a new list
|
||||
grouped.add(new LinkedList<AlignmentStateMachine>());
|
||||
last = stateMachine;
|
||||
}
|
||||
grouped.getLast().add(stateMachine);
|
||||
}
|
||||
|
||||
return grouped;
|
||||
}
|
||||
|
||||
/**
|
||||
* Flattens the grouped list of list of alignment state machines into a single list in order
|
||||
* @return a non-null list contains the state machines
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
private LinkedList<AlignmentStateMachine> flattenByAlignmentStart(final List<LinkedList<AlignmentStateMachine>> grouped) {
|
||||
final LinkedList<AlignmentStateMachine> flat = new LinkedList<AlignmentStateMachine>();
|
||||
for ( final List<AlignmentStateMachine> l : grouped )
|
||||
flat.addAll(l);
|
||||
return flat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that the reads are ordered by their alignment starts
|
||||
* @return true if well ordered, false otherwise
|
||||
*/
|
||||
private boolean readStartsAreWellOrdered() {
|
||||
int lastStart = -1;
|
||||
for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) {
|
||||
if ( lastStart > machine.getRead().getAlignmentStart() )
|
||||
return false;
|
||||
lastStart = machine.getRead().getAlignmentStart();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assumes it can just keep the states linked lists without making a copy
|
||||
* @param states the new states to add to this manager
|
||||
* @return The change in the number of states, after including states and potentially downsampling
|
||||
* @return The change in the number of states, after including states and potentially downsampling. Note
|
||||
* that this return result might be negative, if downsampling is enabled, as we might drop
|
||||
* more sites than have been added by the downsampler
|
||||
*/
|
||||
@Requires("states != null")
|
||||
@Ensures("result >= 0")
|
||||
public int addStatesAtNextAlignmentStart(LinkedList<AlignmentStateMachine> states) {
|
||||
public int addStatesAtNextAlignmentStart(final LinkedList<AlignmentStateMachine> states) {
|
||||
if ( states.isEmpty() ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
readStatesByAlignmentStart.add(states);
|
||||
readStatesByAlignmentStart.addAll(states);
|
||||
int nStatesAdded = states.size();
|
||||
|
||||
if ( isDownsampling() ) {
|
||||
if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) {
|
||||
// only go into the downsampling branch if we are downsampling and the coverage > the target
|
||||
captureDownsamplingStats();
|
||||
levelingDownsampler.submit(readStatesByAlignmentStart);
|
||||
levelingDownsampler.submit(groupByAlignmentStart());
|
||||
levelingDownsampler.signalEndOfInput();
|
||||
|
||||
nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
|
||||
// use returned List directly rather than make a copy, for efficiency's sake
|
||||
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
|
||||
readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems());
|
||||
levelingDownsampler.reset();
|
||||
}
|
||||
|
||||
thisSampleReadStates += nStatesAdded;
|
||||
return nStatesAdded;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is downsampling enabled for this manager?
|
||||
* @return true if we are downsampling, false otherwise
|
||||
*/
|
||||
private boolean isDownsampling() {
|
||||
return levelingDownsampler != null;
|
||||
}
|
||||
|
||||
private AlignmentStateMachine getFirst() {
|
||||
if (readStatesByAlignmentStart.isEmpty())
|
||||
return null;
|
||||
else
|
||||
return readStatesByAlignmentStart.get(0).getFirst();
|
||||
/**
|
||||
* Get the leftmost alignment state machine, or null if the read states is empty
|
||||
* @return a potentially null AlignmentStateMachine
|
||||
*/
|
||||
public AlignmentStateMachine getFirst() {
|
||||
return isEmpty() ? null : readStatesByAlignmentStart.getFirst();
|
||||
}
|
||||
|
||||
/**
|
||||
* Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true
|
||||
*/
|
||||
@Requires("isDownsampling()")
|
||||
private void captureDownsamplingStats() {
|
||||
if ( CAPTURE_DOWNSAMPLING_STATS ) {
|
||||
nSites++;
|
||||
final int loc = getFirst().getGenomePosition();
|
||||
String message = "Pass through";
|
||||
final boolean downsampling = thisSampleReadStates > downsamplingTarget;
|
||||
final boolean downsampling = size() > downsamplingTarget;
|
||||
if ( downsampling ) {
|
||||
nSitesNeedingDownsampling++;
|
||||
message = "Downsampling";
|
||||
|
|
@ -118,7 +206,7 @@ final class PerSampleReadStateManager implements Iterable<AlignmentStateMachine>
|
|||
|
||||
if ( downsampling || nSites % 10000 == 0 )
|
||||
logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e",
|
||||
message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites)));
|
||||
message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites)));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -130,17 +218,13 @@ final class PerSampleReadStateManager implements Iterable<AlignmentStateMachine>
|
|||
return readStatesByAlignmentStart.isEmpty();
|
||||
}
|
||||
|
||||
public AlignmentStateMachine peek() {
|
||||
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of read states currently in this manager
|
||||
* @return the number of read states
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int size() {
|
||||
return thisSampleReadStates;
|
||||
return readStatesByAlignmentStart.size();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -166,38 +250,12 @@ final class PerSampleReadStateManager implements Iterable<AlignmentStateMachine>
|
|||
return nRemoved;
|
||||
}
|
||||
|
||||
// todo -- reimplement
|
||||
/**
|
||||
* Iterate over the AlignmentStateMachine in this manager in alignment start order.
|
||||
* @return a valid iterator
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Iterator<AlignmentStateMachine> iterator() {
|
||||
return new Iterator<AlignmentStateMachine>() {
|
||||
private final Iterator<LinkedList<AlignmentStateMachine>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
|
||||
private LinkedList<AlignmentStateMachine> currentPositionReadStates;
|
||||
private Iterator<AlignmentStateMachine> currentPositionReadStatesIterator;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return alignmentStartIterator.hasNext() ||
|
||||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
|
||||
}
|
||||
|
||||
@Override
|
||||
public AlignmentStateMachine next() {
|
||||
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
|
||||
currentPositionReadStates = alignmentStartIterator.next();
|
||||
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
|
||||
}
|
||||
|
||||
return currentPositionReadStatesIterator.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
currentPositionReadStatesIterator.remove();
|
||||
thisSampleReadStates--;
|
||||
|
||||
if ( currentPositionReadStates.isEmpty() ) {
|
||||
alignmentStartIterator.remove();
|
||||
}
|
||||
}
|
||||
};
|
||||
return readStatesByAlignmentStart.iterator();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ final class ReadStateManager implements Iterable<Map.Entry<String, PerSampleRead
|
|||
public AlignmentStateMachine getFirst() {
|
||||
for ( final PerSampleReadStateManager manager : readStatesBySample.values() ) {
|
||||
if ( ! manager.isEmpty() )
|
||||
return manager.peek();
|
||||
return manager.getFirst();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue