From 5c2799554aca87f3a5a0d95c609baff574f5e261 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 12:23:51 -0500 Subject: [PATCH] Refactor updateReadStates into PerSampleReadStateManager, add tracking of downsampling rate --- .../utils/locusiterator/LIBSPerformance.java | 4 +- .../utils/locusiterator/ReadStateManager.java | 70 ++++++++++++++----- 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java index 0985ed196..2d074f420 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java @@ -63,6 +63,8 @@ public class LIBSPerformance extends CommandLineProgram { @Argument(fullName = "L", shortName = "L", doc = "Query location", required = false) public String location = null; + @Argument(fullName = "dt", shortName = "dt", doc = "Enable downsampling", required = false) + public boolean downsample = false; @Override public int execute() throws IOException { @@ -86,7 +88,7 @@ public class LIBSPerformance extends CommandLineProgram { for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() ) samples.add(rg.getSample()); - final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(false, -1); + final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(downsample, 250); final LocusIteratorByState libs = new LocusIteratorByState( diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index b5dbe2ddb..3276291ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.CigarOperator; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -50,6 +51,8 @@ import java.util.*; * Time: 2:02 PM */ final class ReadStateManager implements Iterable> { + private final static Logger logger = Logger.getLogger(ReadStateManager.class); + private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; @@ -138,18 +141,8 @@ final class ReadStateManager implements Iterable it = readStateManager.iterator(); - while (it.hasNext()) { - final AlignmentStateMachine state = it.next(); - final CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } + for (final PerSampleReadStateManager perSampleReadStateManager : readStatesBySample.values() ) { + perSampleReadStateManager.updateReadStates(); } } @@ -301,13 +294,17 @@ final class ReadStateManager implements Iterable { + protected final class PerSampleReadStateManager implements Iterable { private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; + private final int downsamplingTarget; + private int nSitesNeedingDownsampling = 0; + private int nSites = 0; + public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) : null; @@ -326,7 +323,8 @@ final class ReadStateManager implements Iterable downsamplingTarget; + if ( downsampling ) { + nSitesNeedingDownsampling++; + message = "Downsampling"; + } + + if ( downsampling || nSites % 10000 == 0 ) + logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", + message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + } + } + public boolean isEmpty() { return readStatesByAlignmentStart.isEmpty(); } @@ -351,11 +371,25 @@ final class ReadStateManager implements Iterable it = iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } + } + public Iterator iterator() { return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; + private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates; + private Iterator currentPositionReadStatesIterator; public boolean hasNext() { return alignmentStartIterator.hasNext() ||