From e77f76f8e16ad88a03a723174da90de71a2ff79f Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 16 Jun 2010 17:23:27 +0000 Subject: [PATCH] Reenabled downsampling by sample after basic sanity testing and fixes of the new implementation. Hard testing and performance enhancements are still pending. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3566 348d0f76-0448-11de-a6fe-93d51630548a --- .../DownsamplingLocusIteratorByState.java | 213 ++++++++++++++---- 1 file changed, 167 insertions(+), 46 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java index 1d10980a7..1bb75d188 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java @@ -408,6 +408,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { if ( state.getRead().getMappingQuality() == 0 ) { nMQ0Reads++; } + // TODO: sample split! if( indelPile.size() != 0 ) fullExtendedEventPileup.put(sampleName,new UnifiedReadBackedExtendedEventPileup(loc,indelPile,size,maxDeletionLength,nDeletions,nInsertions,nMQ0Reads)); } } @@ -538,15 +539,15 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { private final PeekableIterator iterator; private final DownsamplingMethod downsamplingMethod; - private final ReadSelector firstReadSelector; + private final ReadSelector chainedReadSelector; private final SamplePartitioner samplePartitioner; - private final Map> readStatesBySample = new HashMap>(); + private final Map readStatesBySample = new HashMap(); private final int targetCoverage; private final int maxReadsAtLocus; - private int totalReadStatesInHanger = 0; + private int totalReadStates = 0; /** * Store a random number generator with a consistent seed for consistent downsampling from run to run. @@ -574,7 +575,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { samplePartitioner = new SamplePartitioner(sampleNames); for(String sampleName: sampleNames) - readStatesBySample.put(sampleName,new LinkedList()); + readStatesBySample.put(sampleName,new PerSampleReadStateManager()); ReadSelector primaryReadSelector; if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) { @@ -583,7 +584,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { else primaryReadSelector = samplePartitioner; - firstReadSelector = maxReadsAtLocus!=Integer.MAX_VALUE ? new FirstNReadSelector(primaryReadSelector,maxReadsAtLocus) : primaryReadSelector; + chainedReadSelector = maxReadsAtLocus!=Integer.MAX_VALUE ? new FirstNReadSelector(primaryReadSelector,maxReadsAtLocus) : primaryReadSelector; } public Iterator iteratorForSample(final String sampleName) { @@ -600,26 +601,22 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { public void remove() { wrappedIterator.remove(); - totalReadStatesInHanger--; + totalReadStates--; } }; } public boolean isEmpty() { - return totalReadStatesInHanger == 0; + return totalReadStates == 0; } public int size() { - int size = 0; - for(Deque readStates: readStatesBySample.values()) { - size += readStates.size(); - } - return size; + return totalReadStates; } public SAMRecordState getFirst() { for(String sampleName: sampleNames) { - Deque reads = readStatesBySample.get(sampleName); + PerSampleReadStateManager reads = readStatesBySample.get(sampleName); if(!reads.isEmpty()) return reads.peek(); } @@ -627,7 +624,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { } public boolean hasNext() { - return totalReadStatesInHanger > 0 || iterator.hasNext(); + return totalReadStates > 0 || iterator.hasNext(); } public void collectPendingReads() { @@ -638,7 +635,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { int firstContigIndex = iterator.peek().getReferenceIndex(); int firstAlignmentStart = iterator.peek().getAlignmentStart(); while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { - firstReadSelector.submitRead(iterator.next()); + chainedReadSelector.submitRead(iterator.next()); } } else { @@ -647,59 +644,67 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { return; while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - firstReadSelector.submitRead(iterator.next()); + chainedReadSelector.submitRead(iterator.next()); } } - firstReadSelector.complete(); - - int readStatesInHangerEntry = 0; + chainedReadSelector.complete(); for(String sampleName: sampleNames) { ReadSelector aggregator = samplePartitioner.getSelectedReads(sampleName); Collection newReads = new ArrayList(aggregator.getSelectedReads()); - Deque hanger = readStatesBySample.get(sampleName); - int readsInHanger = hanger.size(); + PerSampleReadStateManager statesBySample = readStatesBySample.get(sampleName); + int numReads = statesBySample.size(); - if(readsInHanger+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.NONE || downsamplingMethod.type==DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) { + if(numReads+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.NONE || downsamplingMethod.type==DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) { int readLimit = newReads.size(); boolean mrlViolation = false; - if(readLimit > maxReadsAtLocus-totalReadStatesInHanger) { - readLimit = maxReadsAtLocus-totalReadStatesInHanger; + if(readLimit > maxReadsAtLocus-totalReadStates) { + readLimit = maxReadsAtLocus-totalReadStates; mrlViolation = true; } - readStatesInHangerEntry += addReadsToSample(hanger,newReads,readLimit,mrlViolation); + totalReadStates += addReadsToSample(statesBySample,newReads,readLimit,mrlViolation); } else { - // TODO: implement downsampling mechanism - /* - Iterator> backIterator = hanger.descendingIterator(); + int[] counts = statesBySample.getCountsPerAlignmentStart(); + int[] updatedCounts = new int[counts.length]; + System.arraycopy(counts,0,updatedCounts,0,counts.length); + boolean readPruned = true; - while(readsInHanger+newReads.size()>targetCoverage && readPruned) { + while(numReads+newReads.size()>targetCoverage && readPruned) { readPruned = false; - while(readsInHanger+newReads.size()>targetCoverage && backIterator.hasNext()) { - List readsAtLocus = backIterator.next(); - if(readsAtLocus.size() > 1) { - readsAtLocus.remove(downsampleRandomizer.nextInt(readsAtLocus.size())); + for(int alignmentStart=updatedCounts.length-1;numReads+newReads.size()>targetCoverage&&alignmentStart>=0;alignmentStart--) { + if(updatedCounts[alignmentStart] > 1) { + updatedCounts[alignmentStart]--; + numReads--; readPruned = true; - readsInHanger--; } } } - if(readsInHanger == targetCoverage) { - List readsInFirstHanger = hanger.remove(); - readsInHanger -= readsInFirstHanger.size(); + if(numReads == targetCoverage) { + updatedCounts[0]--; + numReads--; } - readStatesInHangerEntry += addReadsToSample(hanger,newReads,targetCoverage-readsInHanger,false); - */ - } + BitSet toPurge = new BitSet(readStates.size()); + int readOffset = 0; - totalReadStatesInHanger += readStatesInHangerEntry; + for(int i = 0; i < updatedCounts.length; i++) { + int n = counts[i]; + int k = updatedCounts[i]; + + for(Integer purgedElement: MathUtils.sampleIndicesWithoutReplacement(n,n-k)) + toPurge.set(readOffset+purgedElement); + + readOffset += counts[i]; + } + statesBySample.purge(toPurge); + totalReadStates += addReadsToSample(statesBySample,newReads,targetCoverage-numReads,false); + } } - firstReadSelector.reset(); + chainedReadSelector.reset(); } /** @@ -709,7 +714,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { * @param maxReads Maximum number of reads to add. * @return Total number of reads added. */ - private int addReadsToSample(final Deque readStates, final Collection reads, final int maxReads, boolean atMaxReadsAtLocusLimit) { + private int addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final int maxReads, boolean atMaxReadsAtLocusLimit) { if(reads.isEmpty()) return 0; @@ -718,12 +723,13 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { // the farthest right a read extends Integer rightMostEnd = -1; + Collection newReadStates = new LinkedList(); int readCount = 0; for(SAMRecord read: reads) { - if(readCount <= maxReads) { + if(readCount < maxReads) { SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents()); state.stepForwardOnGenome(); - readStates.add(state); + newReadStates.add(state); // TODO: What if we downsample the extended events away? if (state.hadIndel()) hasExtendedEvents = true; readCount++; @@ -734,6 +740,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { rightMostEnd = (read.getAlignmentEnd() > rightMostEnd) ? read.getAlignmentEnd() : rightMostEnd; } } + readStates.addStatesAtNextAlignmentStart(newReadStates); if (location != null) overflowTracker.exceeded(GenomeLocParser.createGenomeLoc(location.getContigIndex(),location.getStart(),rightMostEnd), @@ -741,11 +748,125 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { return readCount; } + + private class PerSampleReadStateManager implements Iterable { + private final Queue readStates = new LinkedList(); + private final Deque readStateCounter = new LinkedList(); + + public void addStatesAtNextAlignmentStart(Collection states) { + readStates.addAll(states); + readStateCounter.add(new Counter(states.size())); + } + + public boolean isEmpty() { + return readStates.isEmpty(); + } + + public SAMRecordState peek() { + return readStates.peek(); + } + + public int size() { + return readStates.size(); + } + + public int[] getCountsPerAlignmentStart() { + int[] counts = new int[readStateCounter.size()]; + int index = 0; + for(Counter counter: readStateCounter) + counts[index++] = counter.getCount(); + return counts; + } + + public Iterator iterator() { + return new Iterator() { + private Iterator wrappedIterator = readStates.iterator(); + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecordState next() { + return wrappedIterator.next(); + } + + public void remove() { + wrappedIterator.remove(); + Counter counter = readStateCounter.peek(); + counter.decrement(); + if(counter.getCount() == 0) + readStateCounter.remove(); + } + }; + } + + /** + * Purge the given elements from the bitset. If an element in the bitset is true, purge + * the corresponding read state. + * @param elements bits from the set to purge. + */ + public void purge(final BitSet elements) { + if(elements.isEmpty() || readStates.isEmpty()) return; + + Iterator readStateIterator = readStates.iterator(); + + Iterator counterIterator = readStateCounter.iterator(); + Counter currentCounter = counterIterator.next(); + + int readIndex = 0; + long alignmentStartCounter = currentCounter.getCount(); + + int toPurge = elements.nextSetBit(0); + int removedCount = 0; + + while(readStateIterator.hasNext() && toPurge >= 0) { + readStateIterator.next(); + + if(readIndex == toPurge) { + readStateIterator.remove(); + currentCounter.decrement(); + if(currentCounter.getCount() == 0) + counterIterator.remove(); + removedCount++; + toPurge = elements.nextSetBit(toPurge+1); + } + + readIndex++; + alignmentStartCounter--; + if(alignmentStartCounter == 0 && counterIterator.hasNext()) { + currentCounter = counterIterator.next(); + alignmentStartCounter = currentCounter.getCount(); + } + } + + totalReadStates -= removedCount; + } + } + } + + /** + * Note: assuming that, whenever we downsample, we downsample to an integer capacity. + */ + private class Counter { + private int count; + + public Counter(int count) { + this.count = count; + } + + public int getCount() { + return count; + } + + public void decrement() { + count--; + } } } /** * Selects reads passed to it based on a criteria decided through inheritance. + * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. */ interface ReadSelector { /** @@ -984,7 +1105,7 @@ class SamplePartitioner implements ReadSelector { readsSeen = 0; } } - } +