LIBS optimizations and performance tools
-- Made LIBSPerformance a full featured CommandLineProgram, and it can be used to assess the LIBS performance by reading a provided BAM -- ReadStateManager now provides a clean interface to iterate in sample order the per-sample read states, allowing us to avoid many map.get calls -- Moved updateReadStates to ReadStateManager -- Removed the unnecessary wrapping of an iterator in ReadStateManager -- readStatesBySample is now a LinkedHashMap so that iteration occurs in LIBS sample order, allowing us to avoid many unnecessary calls to map.get iterating over samples. Now those are just map native iterations -- Restructured collectPendingReads for simplicity, removing redundant and consolidating common range checks. The new piece is code is much clearer and avoids several unnecessary function calls
This commit is contained in:
parent
ec05ecef60
commit
83fcc06e28
|
|
@ -113,6 +113,16 @@ public class AlignmentStateMachine {
|
||||||
return read;
|
return read;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the reference index of the underlying read
|
||||||
|
*
|
||||||
|
* @return the reference index of the read
|
||||||
|
*/
|
||||||
|
@Ensures("result == getRead().getReferenceIndex()")
|
||||||
|
public int getReferenceIndex() {
|
||||||
|
return getRead().getReferenceIndex();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Is this the left edge state? I.e., one that is before or after the current read?
|
* Is this the left edge state? I.e., one that is before or after the current read?
|
||||||
* @return true if this state is an edge state, false otherwise
|
* @return true if this state is an edge state, false otherwise
|
||||||
|
|
|
||||||
|
|
@ -34,8 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.*;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
||||||
|
|
@ -234,17 +233,16 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
final GenomeLoc location = getLocation();
|
final GenomeLoc location = getLocation();
|
||||||
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||||
|
|
||||||
// TODO: How can you determine here whether the current pileup has been downsampled?
|
for (final Map.Entry<String, ReadStateManager.PerSampleReadStateManager> sampleStatePair : readStates ) {
|
||||||
boolean hasBeenSampled = false;
|
final String sample = sampleStatePair.getKey();
|
||||||
|
final ReadStateManager.PerSampleReadStateManager readState = sampleStatePair.getValue();
|
||||||
for (final String sample : samples) {
|
final Iterator<AlignmentStateMachine> iterator = readState.iterator();
|
||||||
final Iterator<AlignmentStateMachine> iterator = readStates.iterator(sample);
|
final List<PileupElement> pile = new ArrayList<PileupElement>(readState.size());
|
||||||
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
|
||||||
|
|
||||||
while (iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
// state object with the read/offset information
|
// state object with the read/offset information
|
||||||
final AlignmentStateMachine state = iterator.next();
|
final AlignmentStateMachine state = iterator.next();
|
||||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead();
|
final GATKSAMRecord read = state.getRead();
|
||||||
final CigarOperator op = state.getCigarOperator();
|
final CigarOperator op = state.getCigarOperator();
|
||||||
|
|
||||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||||
|
|
@ -263,29 +261,9 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile));
|
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile));
|
||||||
}
|
}
|
||||||
|
|
||||||
updateReadStates(); // critical - must be called after we get the current state offsets and location
|
readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location
|
||||||
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
|
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
|
||||||
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
|
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Advances all fo the read states by one bp. After this call the read states are reflective
|
|
||||||
* of the next pileup.
|
|
||||||
*/
|
|
||||||
private void updateReadStates() {
|
|
||||||
for (final String sample : samples) {
|
|
||||||
Iterator<AlignmentStateMachine> it = readStates.iterator(sample);
|
|
||||||
while (it.hasNext()) {
|
|
||||||
AlignmentStateMachine state = it.next();
|
|
||||||
CigarOperator op = state.stepForwardOnGenome();
|
|
||||||
if (op == null) {
|
|
||||||
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
|
|
||||||
// already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
|
|
||||||
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
|
|
||||||
it.remove(); // we've stepped off the end of the object
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator;
|
||||||
import com.google.java.contract.Ensures;
|
import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
import com.google.java.contract.Requires;
|
||||||
import net.sf.picard.util.PeekableIterator;
|
import net.sf.picard.util.PeekableIterator;
|
||||||
|
import net.sf.samtools.CigarOperator;
|
||||||
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
|
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
|
||||||
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
|
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
@ -48,11 +49,18 @@ import java.util.*;
|
||||||
* Date: 1/5/13
|
* Date: 1/5/13
|
||||||
* Time: 2:02 PM
|
* Time: 2:02 PM
|
||||||
*/
|
*/
|
||||||
class ReadStateManager {
|
final class ReadStateManager implements Iterable<Map.Entry<String, ReadStateManager.PerSampleReadStateManager>> {
|
||||||
private final List<String> samples;
|
private final List<String> samples;
|
||||||
private final PeekableIterator<GATKSAMRecord> iterator;
|
private final PeekableIterator<GATKSAMRecord> iterator;
|
||||||
private final SamplePartitioner<GATKSAMRecord> samplePartitioner;
|
private final SamplePartitioner<GATKSAMRecord> samplePartitioner;
|
||||||
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
|
|
||||||
|
/**
|
||||||
|
* A mapping from sample name -> the per sample read state manager that manages
|
||||||
|
*
|
||||||
|
* IT IS CRITICAL THAT THIS BE A LINKED HASH MAP, SO THAT THE ITERATION OF THE MAP OCCURS IN THE SAME
|
||||||
|
* ORDER AS THE ORIGINL SAMPLES
|
||||||
|
*/
|
||||||
|
private final Map<String, PerSampleReadStateManager> readStatesBySample = new LinkedHashMap<String, PerSampleReadStateManager>();
|
||||||
|
|
||||||
private LinkedList<GATKSAMRecord> submittedReads;
|
private LinkedList<GATKSAMRecord> submittedReads;
|
||||||
private final boolean keepSubmittedReads;
|
private final boolean keepSubmittedReads;
|
||||||
|
|
@ -70,6 +78,7 @@ class ReadStateManager {
|
||||||
this.submittedReads = new LinkedList<GATKSAMRecord>();
|
this.submittedReads = new LinkedList<GATKSAMRecord>();
|
||||||
|
|
||||||
for (final String sample : samples) {
|
for (final String sample : samples) {
|
||||||
|
// because this is a linked hash map the order of iteration will be in sample order
|
||||||
readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo));
|
readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -77,29 +86,16 @@ class ReadStateManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
|
* Returns a iterator over all the sample -> per-sample read state managers with each sample in this read state manager.
|
||||||
* for this iterator; if present, total read states will be decremented.
|
|
||||||
*
|
*
|
||||||
* @param sample The sample.
|
* The order of iteration is the same as the order of the samples provided upon construction to this
|
||||||
* @return Iterator over the reads associated with that sample.
|
* ReadStateManager.
|
||||||
|
*
|
||||||
|
* @return Iterator over sample + per sample read state manager pairs for this read state manager.
|
||||||
*/
|
*/
|
||||||
public Iterator<AlignmentStateMachine> iterator(final String sample) {
|
@Override
|
||||||
// TODO -- why is this wrapped?
|
public Iterator<Map.Entry<String, ReadStateManager.PerSampleReadStateManager>> iterator() {
|
||||||
return new Iterator<AlignmentStateMachine>() {
|
return readStatesBySample.entrySet().iterator();
|
||||||
private Iterator<AlignmentStateMachine> wrappedIterator = readStatesBySample.get(sample).iterator();
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return wrappedIterator.hasNext();
|
|
||||||
}
|
|
||||||
|
|
||||||
public AlignmentStateMachine next() {
|
|
||||||
return wrappedIterator.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
wrappedIterator.remove();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isEmpty() {
|
public boolean isEmpty() {
|
||||||
|
|
@ -126,10 +122,9 @@ class ReadStateManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
public AlignmentStateMachine getFirst() {
|
public AlignmentStateMachine getFirst() {
|
||||||
for (final String sample : samples) {
|
for ( final PerSampleReadStateManager manager : readStatesBySample.values() ) {
|
||||||
PerSampleReadStateManager reads = readStatesBySample.get(sample);
|
if ( ! manager.isEmpty() )
|
||||||
if (!reads.isEmpty())
|
return manager.peek();
|
||||||
return reads.peek();
|
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
@ -138,51 +133,65 @@ class ReadStateManager {
|
||||||
return totalReadStates > 0 || iterator.hasNext();
|
return totalReadStates > 0 || iterator.hasNext();
|
||||||
}
|
}
|
||||||
|
|
||||||
// fast testing of position
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO -- this function needs to be optimized
|
* Advances all fo the read states by one bp. After this call the read states are reflective
|
||||||
*
|
* of the next pileup.
|
||||||
* Notes:
|
|
||||||
* -- the only place where it's called is in a block where we know isEmpty is false
|
|
||||||
* -- getFirst() is quite expensive, and it seems that we could cache this value in the outer
|
|
||||||
* block, and then pass this in as an argument
|
|
||||||
*
|
|
||||||
* @param read
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
private boolean readIsPastCurrentPosition(GATKSAMRecord read) {
|
public void updateReadStates() {
|
||||||
if (isEmpty())
|
for (final PerSampleReadStateManager readStateManager : readStatesBySample.values() ) {
|
||||||
return false;
|
final Iterator<AlignmentStateMachine> it = readStateManager.iterator();
|
||||||
else {
|
while (it.hasNext()) {
|
||||||
final AlignmentStateMachine state = getFirst();
|
final AlignmentStateMachine state = it.next();
|
||||||
final GATKSAMRecord ourRead = state.getRead();
|
final CigarOperator op = state.stepForwardOnGenome();
|
||||||
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
|
if (op == null) {
|
||||||
|
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
|
||||||
|
// already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
|
||||||
|
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
|
||||||
|
it.remove(); // we've stepped off the end of the object
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does read start at the same position as described by currentContextIndex and currentAlignmentStart?
|
||||||
|
*
|
||||||
|
* @param read the read we want to test
|
||||||
|
* @param currentContigIndex the contig index (from the read's getReferenceIndex) of the reads in this state manager
|
||||||
|
* @param currentAlignmentStart the alignment start of the of the left-most position on the
|
||||||
|
* genome of the reads in this read state manager
|
||||||
|
* @return true if read has contig index and start equal to the current ones
|
||||||
|
*/
|
||||||
|
private boolean readStartsAtCurrentPosition(final GATKSAMRecord read, final int currentContigIndex, final int currentAlignmentStart) {
|
||||||
|
return read.getAlignmentStart() == currentAlignmentStart && read.getReferenceIndex() == currentContigIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pull all of the reads off the iterator that overlap the left-most position among all
|
||||||
|
* reads this ReadStateManager
|
||||||
|
*/
|
||||||
public void collectPendingReads() {
|
public void collectPendingReads() {
|
||||||
if (!iterator.hasNext())
|
if (!iterator.hasNext())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// the next record in the stream, peeked as to not remove it from the stream
|
// determine the left-most boundary that determines which reads to keep in this new pileup
|
||||||
|
final int firstContigIndex;
|
||||||
|
final int firstAlignmentStart;
|
||||||
if ( isEmpty() ) {
|
if ( isEmpty() ) {
|
||||||
final int firstContigIndex = iterator.peek().getReferenceIndex();
|
// there are no reads here, so our next state is the next read in the stream
|
||||||
final int firstAlignmentStart = iterator.peek().getAlignmentStart();
|
firstContigIndex = iterator.peek().getReferenceIndex();
|
||||||
while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
|
firstAlignmentStart = iterator.peek().getAlignmentStart();
|
||||||
submitRead(iterator.next());
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// Fast fail in the case that the read is past the current position.
|
// there's a read in the system, so it's our targeted first read
|
||||||
if (readIsPastCurrentPosition(iterator.peek()))
|
final AlignmentStateMachine firstState = getFirst();
|
||||||
return;
|
firstContigIndex = firstState.getReferenceIndex();
|
||||||
|
// note this isn't the alignment start of the read, but rather the alignment start position
|
||||||
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
|
firstAlignmentStart = firstState.getGenomePosition();
|
||||||
submitRead(iterator.next());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
samplePartitioner.doneSubmittingReads();
|
while ( iterator.hasNext() && readStartsAtCurrentPosition(iterator.peek(), firstContigIndex, firstAlignmentStart) ) {
|
||||||
|
submitRead(iterator.next());
|
||||||
|
}
|
||||||
|
|
||||||
for (final String sample : samples) {
|
for (final String sample : samples) {
|
||||||
final Collection<GATKSAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
|
final Collection<GATKSAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
|
||||||
|
|
@ -271,11 +280,11 @@ class ReadStateManager {
|
||||||
if (reads.isEmpty())
|
if (reads.isEmpty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Collection<AlignmentStateMachine> newReadStates = new LinkedList<AlignmentStateMachine>();
|
final LinkedList<AlignmentStateMachine> newReadStates = new LinkedList<AlignmentStateMachine>();
|
||||||
|
|
||||||
for (GATKSAMRecord read : reads) {
|
for (final GATKSAMRecord read : reads) {
|
||||||
AlignmentStateMachine state = new AlignmentStateMachine(read);
|
final AlignmentStateMachine state = new AlignmentStateMachine(read);
|
||||||
if ( state.stepForwardOnGenome() != null )
|
if ( state.stepForwardOnGenome() != null ) // todo -- should be an assertion not a skip
|
||||||
// explicitly filter out reads that are all insertions / soft clips
|
// explicitly filter out reads that are all insertions / soft clips
|
||||||
newReadStates.add(state);
|
newReadStates.add(state);
|
||||||
}
|
}
|
||||||
|
|
@ -283,6 +292,7 @@ class ReadStateManager {
|
||||||
readStates.addStatesAtNextAlignmentStart(newReadStates);
|
readStates.addStatesAtNextAlignmentStart(newReadStates);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO -- refactor into separate class with pointer to ReadStateManager for updates to the total counts
|
||||||
protected class PerSampleReadStateManager implements Iterable<AlignmentStateMachine> {
|
protected class PerSampleReadStateManager implements Iterable<AlignmentStateMachine> {
|
||||||
private List<LinkedList<AlignmentStateMachine>> readStatesByAlignmentStart = new LinkedList<LinkedList<AlignmentStateMachine>>();
|
private List<LinkedList<AlignmentStateMachine>> readStatesByAlignmentStart = new LinkedList<LinkedList<AlignmentStateMachine>>();
|
||||||
private final Downsampler<LinkedList<AlignmentStateMachine>> levelingDownsampler;
|
private final Downsampler<LinkedList<AlignmentStateMachine>> levelingDownsampler;
|
||||||
|
|
@ -295,12 +305,16 @@ class ReadStateManager {
|
||||||
: null;
|
: null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addStatesAtNextAlignmentStart(Collection<AlignmentStateMachine> states) {
|
/**
|
||||||
|
* Assumes it can just keep the states linked lists without making a copy
|
||||||
|
* @param states
|
||||||
|
*/
|
||||||
|
public void addStatesAtNextAlignmentStart(LinkedList<AlignmentStateMachine> states) {
|
||||||
if ( states.isEmpty() ) {
|
if ( states.isEmpty() ) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
readStatesByAlignmentStart.add(new LinkedList<AlignmentStateMachine>(states));
|
readStatesByAlignmentStart.add(states);
|
||||||
thisSampleReadStates += states.size();
|
thisSampleReadStates += states.size();
|
||||||
totalReadStates += states.size();
|
totalReadStates += states.size();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest {
|
||||||
makeReads();
|
makeReads();
|
||||||
|
|
||||||
for ( ArrayList<AlignmentStateMachine> stackRecordStates : recordStatesByAlignmentStart ) {
|
for ( ArrayList<AlignmentStateMachine> stackRecordStates : recordStatesByAlignmentStart ) {
|
||||||
perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates);
|
perSampleReadStateManager.addStatesAtNextAlignmentStart(new LinkedList<AlignmentStateMachine>(stackRecordStates));
|
||||||
}
|
}
|
||||||
|
|
||||||
// read state manager should have the right number of reads
|
// read state manager should have the right number of reads
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue