LIBS optimizations and performance tools

-- Made LIBSPerformance a full featured CommandLineProgram, and it can be used to assess the LIBS performance by reading a provided BAM
-- ReadStateManager now provides a clean interface to iterate in sample order the per-sample read states, allowing us to avoid many map.get calls
-- Moved updateReadStates to ReadStateManager
-- Removed the unnecessary wrapping of an iterator in ReadStateManager
-- readStatesBySample is now a LinkedHashMap so that iteration occurs in LIBS sample order, allowing us to avoid many unnecessary calls to map.get iterating over samples.  Now those are just map native iterations
-- Restructured collectPendingReads for simplicity, removing redundant and consolidating common range checks.  The new piece is code is much clearer and avoids several unnecessary function calls
This commit is contained in:
Mark DePristo 2013-01-12 12:41:13 -05:00
parent ec05ecef60
commit 83fcc06e28
4 changed files with 99 additions and 97 deletions

View File

@ -113,6 +113,16 @@ public class AlignmentStateMachine {
return read; return read;
} }
/**
* Get the reference index of the underlying read
*
* @return the reference index of the read
*/
@Ensures("result == getRead().getReferenceIndex()")
public int getReferenceIndex() {
return getRead().getReferenceIndex();
}
/** /**
* Is this the left edge state? I.e., one that is before or after the current read? * Is this the left edge state? I.e., one that is before or after the current read?
* @return true if this state is an edge state, false otherwise * @return true if this state is an edge state, false otherwise

View File

@ -34,8 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -234,17 +233,16 @@ public class LocusIteratorByState extends LocusIterator {
final GenomeLoc location = getLocation(); final GenomeLoc location = getLocation();
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>(); final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
// TODO: How can you determine here whether the current pileup has been downsampled? for (final Map.Entry<String, ReadStateManager.PerSampleReadStateManager> sampleStatePair : readStates ) {
boolean hasBeenSampled = false; final String sample = sampleStatePair.getKey();
final ReadStateManager.PerSampleReadStateManager readState = sampleStatePair.getValue();
for (final String sample : samples) { final Iterator<AlignmentStateMachine> iterator = readState.iterator();
final Iterator<AlignmentStateMachine> iterator = readStates.iterator(sample); final List<PileupElement> pile = new ArrayList<PileupElement>(readState.size());
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
while (iterator.hasNext()) { while (iterator.hasNext()) {
// state object with the read/offset information // state object with the read/offset information
final AlignmentStateMachine state = iterator.next(); final AlignmentStateMachine state = iterator.next();
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); final GATKSAMRecord read = state.getRead();
final CigarOperator op = state.getCigarOperator(); final CigarOperator op = state.getCigarOperator();
if (op == CigarOperator.N) // N's are never added to any pileup if (op == CigarOperator.N) // N's are never added to any pileup
@ -263,29 +261,9 @@ public class LocusIteratorByState extends LocusIterator {
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile)); fullPileup.put(sample, new ReadBackedPileupImpl(location, pile));
} }
updateReadStates(); // critical - must be called after we get the current state offsets and location readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false);
}
}
/**
* Advances all fo the read states by one bp. After this call the read states are reflective
* of the next pileup.
*/
private void updateReadStates() {
for (final String sample : samples) {
Iterator<AlignmentStateMachine> it = readStates.iterator(sample);
while (it.hasNext()) {
AlignmentStateMachine state = it.next();
CigarOperator op = state.stepForwardOnGenome();
if (op == null) {
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
// already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
it.remove(); // we've stepped off the end of the object
}
}
} }
} }

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.Downsampler;
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -48,11 +49,18 @@ import java.util.*;
* Date: 1/5/13 * Date: 1/5/13
* Time: 2:02 PM * Time: 2:02 PM
*/ */
class ReadStateManager { final class ReadStateManager implements Iterable<Map.Entry<String, ReadStateManager.PerSampleReadStateManager>> {
private final List<String> samples; private final List<String> samples;
private final PeekableIterator<GATKSAMRecord> iterator; private final PeekableIterator<GATKSAMRecord> iterator;
private final SamplePartitioner<GATKSAMRecord> samplePartitioner; private final SamplePartitioner<GATKSAMRecord> samplePartitioner;
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
/**
* A mapping from sample name -> the per sample read state manager that manages
*
* IT IS CRITICAL THAT THIS BE A LINKED HASH MAP, SO THAT THE ITERATION OF THE MAP OCCURS IN THE SAME
* ORDER AS THE ORIGINL SAMPLES
*/
private final Map<String, PerSampleReadStateManager> readStatesBySample = new LinkedHashMap<String, PerSampleReadStateManager>();
private LinkedList<GATKSAMRecord> submittedReads; private LinkedList<GATKSAMRecord> submittedReads;
private final boolean keepSubmittedReads; private final boolean keepSubmittedReads;
@ -70,6 +78,7 @@ class ReadStateManager {
this.submittedReads = new LinkedList<GATKSAMRecord>(); this.submittedReads = new LinkedList<GATKSAMRecord>();
for (final String sample : samples) { for (final String sample : samples) {
// because this is a linked hash map the order of iteration will be in sample order
readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo)); readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo));
} }
@ -77,29 +86,16 @@ class ReadStateManager {
} }
/** /**
* Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented * Returns a iterator over all the sample -> per-sample read state managers with each sample in this read state manager.
* for this iterator; if present, total read states will be decremented.
* *
* @param sample The sample. * The order of iteration is the same as the order of the samples provided upon construction to this
* @return Iterator over the reads associated with that sample. * ReadStateManager.
*
* @return Iterator over sample + per sample read state manager pairs for this read state manager.
*/ */
public Iterator<AlignmentStateMachine> iterator(final String sample) { @Override
// TODO -- why is this wrapped? public Iterator<Map.Entry<String, ReadStateManager.PerSampleReadStateManager>> iterator() {
return new Iterator<AlignmentStateMachine>() { return readStatesBySample.entrySet().iterator();
private Iterator<AlignmentStateMachine> wrappedIterator = readStatesBySample.get(sample).iterator();
public boolean hasNext() {
return wrappedIterator.hasNext();
}
public AlignmentStateMachine next() {
return wrappedIterator.next();
}
public void remove() {
wrappedIterator.remove();
}
};
} }
public boolean isEmpty() { public boolean isEmpty() {
@ -126,10 +122,9 @@ class ReadStateManager {
} }
public AlignmentStateMachine getFirst() { public AlignmentStateMachine getFirst() {
for (final String sample : samples) { for ( final PerSampleReadStateManager manager : readStatesBySample.values() ) {
PerSampleReadStateManager reads = readStatesBySample.get(sample); if ( ! manager.isEmpty() )
if (!reads.isEmpty()) return manager.peek();
return reads.peek();
} }
return null; return null;
} }
@ -138,51 +133,65 @@ class ReadStateManager {
return totalReadStates > 0 || iterator.hasNext(); return totalReadStates > 0 || iterator.hasNext();
} }
// fast testing of position
/** /**
* TODO -- this function needs to be optimized * Advances all fo the read states by one bp. After this call the read states are reflective
* * of the next pileup.
* Notes:
* -- the only place where it's called is in a block where we know isEmpty is false
* -- getFirst() is quite expensive, and it seems that we could cache this value in the outer
* block, and then pass this in as an argument
*
* @param read
* @return
*/ */
private boolean readIsPastCurrentPosition(GATKSAMRecord read) { public void updateReadStates() {
if (isEmpty()) for (final PerSampleReadStateManager readStateManager : readStatesBySample.values() ) {
return false; final Iterator<AlignmentStateMachine> it = readStateManager.iterator();
else { while (it.hasNext()) {
final AlignmentStateMachine state = getFirst(); final AlignmentStateMachine state = it.next();
final GATKSAMRecord ourRead = state.getRead(); final CigarOperator op = state.stepForwardOnGenome();
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); if (op == null) {
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
// already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
it.remove(); // we've stepped off the end of the object
}
}
} }
} }
/**
* Does read start at the same position as described by currentContextIndex and currentAlignmentStart?
*
* @param read the read we want to test
* @param currentContigIndex the contig index (from the read's getReferenceIndex) of the reads in this state manager
* @param currentAlignmentStart the alignment start of the of the left-most position on the
* genome of the reads in this read state manager
* @return true if read has contig index and start equal to the current ones
*/
private boolean readStartsAtCurrentPosition(final GATKSAMRecord read, final int currentContigIndex, final int currentAlignmentStart) {
return read.getAlignmentStart() == currentAlignmentStart && read.getReferenceIndex() == currentContigIndex;
}
/**
* Pull all of the reads off the iterator that overlap the left-most position among all
* reads this ReadStateManager
*/
public void collectPendingReads() { public void collectPendingReads() {
if (!iterator.hasNext()) if (!iterator.hasNext())
return; return;
// the next record in the stream, peeked as to not remove it from the stream // determine the left-most boundary that determines which reads to keep in this new pileup
final int firstContigIndex;
final int firstAlignmentStart;
if ( isEmpty() ) { if ( isEmpty() ) {
final int firstContigIndex = iterator.peek().getReferenceIndex(); // there are no reads here, so our next state is the next read in the stream
final int firstAlignmentStart = iterator.peek().getAlignmentStart(); firstContigIndex = iterator.peek().getReferenceIndex();
while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { firstAlignmentStart = iterator.peek().getAlignmentStart();
submitRead(iterator.next());
}
} else { } else {
// Fast fail in the case that the read is past the current position. // there's a read in the system, so it's our targeted first read
if (readIsPastCurrentPosition(iterator.peek())) final AlignmentStateMachine firstState = getFirst();
return; firstContigIndex = firstState.getReferenceIndex();
// note this isn't the alignment start of the read, but rather the alignment start position
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { firstAlignmentStart = firstState.getGenomePosition();
submitRead(iterator.next());
}
} }
samplePartitioner.doneSubmittingReads(); while ( iterator.hasNext() && readStartsAtCurrentPosition(iterator.peek(), firstContigIndex, firstAlignmentStart) ) {
submitRead(iterator.next());
}
for (final String sample : samples) { for (final String sample : samples) {
final Collection<GATKSAMRecord> newReads = samplePartitioner.getReadsForSample(sample); final Collection<GATKSAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
@ -271,11 +280,11 @@ class ReadStateManager {
if (reads.isEmpty()) if (reads.isEmpty())
return; return;
Collection<AlignmentStateMachine> newReadStates = new LinkedList<AlignmentStateMachine>(); final LinkedList<AlignmentStateMachine> newReadStates = new LinkedList<AlignmentStateMachine>();
for (GATKSAMRecord read : reads) { for (final GATKSAMRecord read : reads) {
AlignmentStateMachine state = new AlignmentStateMachine(read); final AlignmentStateMachine state = new AlignmentStateMachine(read);
if ( state.stepForwardOnGenome() != null ) if ( state.stepForwardOnGenome() != null ) // todo -- should be an assertion not a skip
// explicitly filter out reads that are all insertions / soft clips // explicitly filter out reads that are all insertions / soft clips
newReadStates.add(state); newReadStates.add(state);
} }
@ -283,6 +292,7 @@ class ReadStateManager {
readStates.addStatesAtNextAlignmentStart(newReadStates); readStates.addStatesAtNextAlignmentStart(newReadStates);
} }
// TODO -- refactor into separate class with pointer to ReadStateManager for updates to the total counts
protected class PerSampleReadStateManager implements Iterable<AlignmentStateMachine> { protected class PerSampleReadStateManager implements Iterable<AlignmentStateMachine> {
private List<LinkedList<AlignmentStateMachine>> readStatesByAlignmentStart = new LinkedList<LinkedList<AlignmentStateMachine>>(); private List<LinkedList<AlignmentStateMachine>> readStatesByAlignmentStart = new LinkedList<LinkedList<AlignmentStateMachine>>();
private final Downsampler<LinkedList<AlignmentStateMachine>> levelingDownsampler; private final Downsampler<LinkedList<AlignmentStateMachine>> levelingDownsampler;
@ -295,12 +305,16 @@ class ReadStateManager {
: null; : null;
} }
public void addStatesAtNextAlignmentStart(Collection<AlignmentStateMachine> states) { /**
* Assumes it can just keep the states linked lists without making a copy
* @param states
*/
public void addStatesAtNextAlignmentStart(LinkedList<AlignmentStateMachine> states) {
if ( states.isEmpty() ) { if ( states.isEmpty() ) {
return; return;
} }
readStatesByAlignmentStart.add(new LinkedList<AlignmentStateMachine>(states)); readStatesByAlignmentStart.add(states);
thisSampleReadStates += states.size(); thisSampleReadStates += states.size();
totalReadStates += states.size(); totalReadStates += states.size();

View File

@ -71,7 +71,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest {
makeReads(); makeReads();
for ( ArrayList<AlignmentStateMachine> stackRecordStates : recordStatesByAlignmentStart ) { for ( ArrayList<AlignmentStateMachine> stackRecordStates : recordStatesByAlignmentStart ) {
perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); perSampleReadStateManager.addStatesAtNextAlignmentStart(new LinkedList<AlignmentStateMachine>(stackRecordStates));
} }
// read state manager should have the right number of reads // read state manager should have the right number of reads