diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java index 32e56866b..50bc9e25b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java @@ -113,6 +113,16 @@ public class AlignmentStateMachine { return read; } + /** + * Get the reference index of the underlying read + * + * @return the reference index of the read + */ + @Ensures("result == getRead().getReferenceIndex()") + public int getReferenceIndex() { + return getRead().getReferenceIndex(); + } + /** * Is this the left edge state? I.e., one that is before or after the current read? * @return true if this state is an edge state, false otherwise diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index 01c9e564e..9499bfa35 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -34,8 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -234,17 +233,16 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - // TODO: How can you determine here whether the current pileup has been downsampled? - boolean hasBeenSampled = false; - - for (final String sample : samples) { - final Iterator iterator = readStates.iterator(sample); - final List pile = new ArrayList(readStates.size(sample)); + for (final Map.Entry sampleStatePair : readStates ) { + final String sample = sampleStatePair.getKey(); + final ReadStateManager.PerSampleReadStateManager readState = sampleStatePair.getValue(); + final Iterator iterator = readState.iterator(); + final List pile = new ArrayList(readState.size()); while (iterator.hasNext()) { // state object with the read/offset information final AlignmentStateMachine state = iterator.next(); - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); + final GATKSAMRecord read = state.getRead(); final CigarOperator op = state.getCigarOperator(); if (op == CigarOperator.N) // N's are never added to any pileup @@ -263,29 +261,9 @@ public class LocusIteratorByState extends LocusIterator { fullPileup.put(sample, new ReadBackedPileupImpl(location, pile)); } - updateReadStates(); // critical - must be called after we get the current state offsets and location + readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done - nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); - } - } - - /** - * Advances all fo the read states by one bp. After this call the read states are reflective - * of the next pileup. - */ - private void updateReadStates() { - for (final String sample : samples) { - Iterator it = readStates.iterator(sample); - while (it.hasNext()) { - AlignmentStateMachine state = it.next(); - CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 2dcf01d72..0a8d3a108 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,11 +49,18 @@ import java.util.*; * Date: 1/5/13 * Time: 2:02 PM */ -class ReadStateManager { +final class ReadStateManager implements Iterable> { private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + + /** + * A mapping from sample name -> the per sample read state manager that manages + * + * IT IS CRITICAL THAT THIS BE A LINKED HASH MAP, SO THAT THE ITERATION OF THE MAP OCCURS IN THE SAME + * ORDER AS THE ORIGINL SAMPLES + */ + private final Map readStatesBySample = new LinkedHashMap(); private LinkedList submittedReads; private final boolean keepSubmittedReads; @@ -70,6 +78,7 @@ class ReadStateManager { this.submittedReads = new LinkedList(); for (final String sample : samples) { + // because this is a linked hash map the order of iteration will be in sample order readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo)); } @@ -77,29 +86,16 @@ class ReadStateManager { } /** - * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented - * for this iterator; if present, total read states will be decremented. + * Returns a iterator over all the sample -> per-sample read state managers with each sample in this read state manager. * - * @param sample The sample. - * @return Iterator over the reads associated with that sample. + * The order of iteration is the same as the order of the samples provided upon construction to this + * ReadStateManager. + * + * @return Iterator over sample + per sample read state manager pairs for this read state manager. */ - public Iterator iterator(final String sample) { - // TODO -- why is this wrapped? - return new Iterator() { - private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public AlignmentStateMachine next() { - return wrappedIterator.next(); - } - - public void remove() { - wrappedIterator.remove(); - } - }; + @Override + public Iterator> iterator() { + return readStatesBySample.entrySet().iterator(); } public boolean isEmpty() { @@ -126,10 +122,9 @@ class ReadStateManager { } public AlignmentStateMachine getFirst() { - for (final String sample : samples) { - PerSampleReadStateManager reads = readStatesBySample.get(sample); - if (!reads.isEmpty()) - return reads.peek(); + for ( final PerSampleReadStateManager manager : readStatesBySample.values() ) { + if ( ! manager.isEmpty() ) + return manager.peek(); } return null; } @@ -138,51 +133,65 @@ class ReadStateManager { return totalReadStates > 0 || iterator.hasNext(); } - // fast testing of position - /** - * TODO -- this function needs to be optimized - * - * Notes: - * -- the only place where it's called is in a block where we know isEmpty is false - * -- getFirst() is quite expensive, and it seems that we could cache this value in the outer - * block, and then pass this in as an argument - * - * @param read - * @return + * Advances all fo the read states by one bp. After this call the read states are reflective + * of the next pileup. */ - private boolean readIsPastCurrentPosition(GATKSAMRecord read) { - if (isEmpty()) - return false; - else { - final AlignmentStateMachine state = getFirst(); - final GATKSAMRecord ourRead = state.getRead(); - return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + public void updateReadStates() { + for (final PerSampleReadStateManager readStateManager : readStatesBySample.values() ) { + final Iterator it = readStateManager.iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } } } + /** + * Does read start at the same position as described by currentContextIndex and currentAlignmentStart? + * + * @param read the read we want to test + * @param currentContigIndex the contig index (from the read's getReferenceIndex) of the reads in this state manager + * @param currentAlignmentStart the alignment start of the of the left-most position on the + * genome of the reads in this read state manager + * @return true if read has contig index and start equal to the current ones + */ + private boolean readStartsAtCurrentPosition(final GATKSAMRecord read, final int currentContigIndex, final int currentAlignmentStart) { + return read.getAlignmentStart() == currentAlignmentStart && read.getReferenceIndex() == currentContigIndex; + } + + /** + * Pull all of the reads off the iterator that overlap the left-most position among all + * reads this ReadStateManager + */ public void collectPendingReads() { if (!iterator.hasNext()) return; - // the next record in the stream, peeked as to not remove it from the stream + // determine the left-most boundary that determines which reads to keep in this new pileup + final int firstContigIndex; + final int firstAlignmentStart; if ( isEmpty() ) { - final int firstContigIndex = iterator.peek().getReferenceIndex(); - final int firstAlignmentStart = iterator.peek().getAlignmentStart(); - while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { - submitRead(iterator.next()); - } + // there are no reads here, so our next state is the next read in the stream + firstContigIndex = iterator.peek().getReferenceIndex(); + firstAlignmentStart = iterator.peek().getAlignmentStart(); } else { - // Fast fail in the case that the read is past the current position. - if (readIsPastCurrentPosition(iterator.peek())) - return; - - while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - submitRead(iterator.next()); - } + // there's a read in the system, so it's our targeted first read + final AlignmentStateMachine firstState = getFirst(); + firstContigIndex = firstState.getReferenceIndex(); + // note this isn't the alignment start of the read, but rather the alignment start position + firstAlignmentStart = firstState.getGenomePosition(); } - samplePartitioner.doneSubmittingReads(); + while ( iterator.hasNext() && readStartsAtCurrentPosition(iterator.peek(), firstContigIndex, firstAlignmentStart) ) { + submitRead(iterator.next()); + } for (final String sample : samples) { final Collection newReads = samplePartitioner.getReadsForSample(sample); @@ -271,11 +280,11 @@ class ReadStateManager { if (reads.isEmpty()) return; - Collection newReadStates = new LinkedList(); + final LinkedList newReadStates = new LinkedList(); - for (GATKSAMRecord read : reads) { - AlignmentStateMachine state = new AlignmentStateMachine(read); - if ( state.stepForwardOnGenome() != null ) + for (final GATKSAMRecord read : reads) { + final AlignmentStateMachine state = new AlignmentStateMachine(read); + if ( state.stepForwardOnGenome() != null ) // todo -- should be an assertion not a skip // explicitly filter out reads that are all insertions / soft clips newReadStates.add(state); } @@ -283,6 +292,7 @@ class ReadStateManager { readStates.addStatesAtNextAlignmentStart(newReadStates); } + // TODO -- refactor into separate class with pointer to ReadStateManager for updates to the total counts protected class PerSampleReadStateManager implements Iterable { private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; @@ -295,12 +305,16 @@ class ReadStateManager { : null; } - public void addStatesAtNextAlignmentStart(Collection states) { + /** + * Assumes it can just keep the states linked lists without making a copy + * @param states + */ + public void addStatesAtNextAlignmentStart(LinkedList states) { if ( states.isEmpty() ) { return; } - readStatesByAlignmentStart.add(new LinkedList(states)); + readStatesByAlignmentStart.add(states); thisSampleReadStates += states.size(); totalReadStates += states.size(); diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java index 1db0605c7..76b324d85 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java @@ -71,7 +71,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { makeReads(); for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + perSampleReadStateManager.addStatesAtNextAlignmentStart(new LinkedList(stackRecordStates)); } // read state manager should have the right number of reads