diff --git a/java/src/org/broadinstitute/sting/gatk/Reads.java b/java/src/org/broadinstitute/sting/gatk/Reads.java index 0a4a33db8..c1b30a860 100755 --- a/java/src/org/broadinstitute/sting/gatk/Reads.java +++ b/java/src/org/broadinstitute/sting/gatk/Reads.java @@ -32,7 +32,7 @@ public class Reads { private Integer downsampleToCoverage = null; private ValidationExclusion exclusionList = null; private Collection supplementalFilters = null; - private int maximumReadsAtLocus = Integer.MAX_VALUE; // this should always be set, so we'll default it MAX_INT + protected int maximumReadsAtLocus = Integer.MAX_VALUE; // this should always be set, so we'll default it MAX_INT private boolean includeReadsWithDeletionAtLoci = false; private boolean generateExtendedEvents = false; // do we want to generate additional piles of "extended" events (indels) // immediately after the reference base such event is associated with? diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 8c432f371..a14909fe7 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -37,14 +37,16 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import java.util.*; -/** - * Iterator that traverses a SAM File, accumulating information on a per-locus basis - */ +/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */ public class LocusIteratorByState extends LocusIterator { /** - * our log, which we want to capture anything from this class + * the overflow tracker, which makes sure we get a limited number of warnings for locus pile-ups that + * exceed the max depth */ + private LocusOverflowTracker overflowTracker; + + /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(LocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- @@ -77,16 +79,16 @@ public class LocusIteratorByState extends LocusIterator { // events immediately preceding the current reference base). boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases? - // the only purpose of this flag is to shield away a few additional lines of code - // when extended piles are not needed, it may not be even worth it... + // the only purpose of this flag is to shield away a few additional lines of code + // when extended piles are not needed, it may not be even worth it... byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the - // current base on the ref. We use a counter-like variable here since clearing the indel event is - // delayed by one base, so we need to remember how long ago we have seen the actual event + // current base on the ref. We use a counter-like variable here since clearing the indel event is + // delayed by one base, so we need to remember how long ago we have seen the actual event int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the - // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, - // we cache it here mainly for convenience + // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, + // we cache it here mainly for convenience public SAMRecordState(SAMRecord read, boolean extended) { @@ -257,6 +259,7 @@ public class LocusIteratorByState extends LocusIterator { public LocusIteratorByState(final Iterator samIterator, Reads readInformation) { this.it = new PushbackIterator(samIterator); this.readInfo = readInformation; + overflowTracker = new LocusOverflowTracker(readInformation.getMaxReadsAtLocus()); } public Iterator iterator() { @@ -270,6 +273,9 @@ public class LocusIteratorByState extends LocusIterator { public boolean hasNext() { boolean r = ! readStates.isEmpty() || it.hasNext(); //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); + + // if we don't have a next record, make sure we clean the warning queue + if (!r) overflowTracker.cleanWarningQueue(); return r; } @@ -437,39 +443,30 @@ public class LocusIteratorByState extends LocusIterator { // } // } - private void collectPendingReads(final int maximumPileupSize) { + private void collectPendingReads(int maxReads) { //if (DEBUG) { // logger.debug(String.format("entering collectPendingReads..., hasNext=%b", it.hasNext())); // printState(); //} - Boolean warned = false; // warn them once per locus int curSize = readStates.size(); // simple performance improvement -- avoids unnecessary size() operation while (it.hasNext()) { SAMRecord read = it.next(); - -// if (DEBUG) logger.debug(String.format("Considering read %s: %s vs. %s", -// read.getReadName(), getLocation(), GenomeLocParser.createGenomeLoc(read))); - - if ( readIsPastCurrentPosition(read) ) { - // We've collected up enough reads + if (readIsPastCurrentPosition(read)) { + // We've collected up all the reads that span this region it.pushback(read); break; } else { - if ( curSize >= maximumPileupSize ) { - if (!warned) { - warned = true; - Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maximumPileupSize + " at location " + getLocation()); - } - } else { + if (curSize < maxReads) { SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents()); state.stepForwardOnGenome(); readStates.add(state); curSize++; - if ( state.hadIndel() ) hasExtendedEvents = true; + if (state.hadIndel()) hasExtendedEvents = true; //if (DEBUG) logger.debug(String.format(" ... added read %s", read.getReadName())); } } + overflowTracker.exceeded(read, curSize); } } @@ -513,4 +510,99 @@ public class LocusIteratorByState extends LocusIterator { public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } + + /** + * a method for setting the overflow tracker, for dependency injection + * @param tracker + */ + protected void setLocusOverflowTracker(LocusOverflowTracker tracker) { + this.overflowTracker = tracker; + } + + /** + * a method for getting the overflow tracker, for testing + * @return + */ + protected LocusOverflowTracker getLocusOverflowTracker() { + return this.overflowTracker; + } +} + +/** + * a helper class that organizes the output of warning messages from read pile-ups that + * are greater than the max pile-up size. We only want a single warning from each non-contigous + * interval, up until the maximum warning limit. + */ +class LocusOverflowTracker { + // the last location we found a warning at + protected GenomeLoc lastLocation = null; + + // the maximum warning count, and the number of warnings emitted + protected static int warningsEmitted = 0; + public static final int MAX_WARNINGS = 100; + + // our maximum pileup size + protected final int maxPileupSize; + + // do we have a pending warning? + protected boolean warningInQueue = false; + + /** + * create a LocusOverflowTracker + * + * @param maxPileup the maximum allowed pile-up size + */ + public LocusOverflowTracker(int maxPileup) { + warningInQueue = false; + maxPileupSize = maxPileup; + } + + /** + * have we exceeded the maximum pile-up size? + * + * @param read the current read, we use this to get the GenomeLoc only if needed (for efficiency) + * @param pileupSize the pile-up size + * + * @return return true if we're greater, false if we're not + */ + public boolean exceeded(SAMRecord read, int pileupSize) { + boolean exceeded = pileupSize >= maxPileupSize; + if (exceeded) { + warningInQueue = true; + GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); + if (lastLocation == null) lastLocation = loc; + else if (lastLocation.contiguousP(loc)) + lastLocation = lastLocation.merge(loc); + else { + warnUser(); + lastLocation = loc; + } + } else if (warningInQueue) { + lastLocation = null; + warnUser(); + } + return exceeded; + } + + /** + * clean up the warning queue, making sure we haven't stored a warning + * that hasn't been emitted yet. + */ + public void cleanWarningQueue() { + if (warningInQueue) warnUser(); + } + + /** warn the user, checking to make sure we haven't exceeded the maximum warning level. */ + protected void warnUser() { + if (!warningInQueue) throw new IllegalStateException("Without a warning in the queue, we shouldn't see a call to warnUser()"); + warningInQueue = false; + if (warningsEmitted < MAX_WARNINGS) { + warningsEmitted++; + Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maxPileupSize + " at location " + lastLocation); + } else if (warningsEmitted == MAX_WARNINGS) { + warningsEmitted++; + Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maxPileupSize + " at location " + lastLocation + + "; the maximum warning count has been reached, we will no longer emit warnings of this nature!!"); + } + } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateTest.java new file mode 100644 index 000000000..cd56ef619 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateTest.java @@ -0,0 +1,104 @@ +package org.broadinstitute.sting.gatk.iterators; + +import junit.framework.Assert; +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * testing of the LocusIteratorByState + */ +public class LocusIteratorByStateTest extends BaseTest { + + private final int MAX_READS = 10; + private static SAMFileHeader header; + private LocusIteratorByState li; + + @BeforeClass + public static void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary()); + } + + + @Test + public void testBasicWarnings() { + // create a bunch of fake records + List records = new ArrayList(); + for (int x = 1; x < 50; x++) + records.add(ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 20)); + + // create a test version of the Reads object + TestReads reads = new TestReads(new ArrayList()); + reads.setMaxPileupSize(MAX_READS); + + // create the iterator by state with the fake reads and fake records + li = new LocusIteratorByState(records.iterator(), reads); + + // inject the testing version of the locus iterator watcher + li.setLocusOverflowTracker(new LocusIteratorOverride(MAX_READS)); + ((LocusIteratorOverride)li.getLocusOverflowTracker()).resetWarningCount(); + while (li.hasNext()) { + AlignmentContext context = li.next(); + //System.err.println(context.getLocation() + " " + context.getPileup().size()); + } + Assert.assertEquals(1, ((LocusIteratorOverride) li.getLocusOverflowTracker()).getWarningCount()); + } + + @Test + public void testTwoBigPiles() { + // create a bunch of fake records + List records = new ArrayList(); + for (int x = 1; x < 50; x++) + records.add(ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20)); + for (int x = 1; x < 50; x++) + records.add(ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 100, 20)); + + // create a test version of the Reads object + TestReads reads = new TestReads(new ArrayList()); + reads.setMaxPileupSize(MAX_READS); + + // create the iterator by state with the fake reads and fake records + li = new LocusIteratorByState(records.iterator(), reads); + + // inject the testing version of the locus iterator watcher + li.setLocusOverflowTracker(new LocusIteratorOverride(MAX_READS)); + ((LocusIteratorOverride)li.getLocusOverflowTracker()).resetWarningCount(); + while (li.hasNext()) { + AlignmentContext context = li.next(); + //System.err.println(context.getLocation() + " " + context.getPileup().size()); + } + Assert.assertEquals(2, ((LocusIteratorOverride) li.getLocusOverflowTracker()).getWarningCount()); + } +} + + +class TestReads extends Reads { + + /** + * Simple constructor for unit testing. + * + * @param readsFiles List of reads files to open. + */ + public TestReads(List readsFiles) { + super(readsFiles); + } + + public void setMaxPileupSize(int maxSize) { + this.maximumReadsAtLocus = maxSize; + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/LocusOverflowTrackerTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/LocusOverflowTrackerTest.java new file mode 100644 index 000000000..b6cc32c01 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/iterators/LocusOverflowTrackerTest.java @@ -0,0 +1,112 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + + +/** + * @author aaron + *

+ * Class LocusOverflowTrackerTest + *

+ * test out the locus overflow tracker + */ +public class LocusOverflowTrackerTest extends BaseTest { + + private LocusOverflowTracker tracker; + private final int MAX_READS = 10; + private static SAMFileHeader header; + + @BeforeClass + public static void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary()); + } + + @Before + public void beforeTest() { + tracker = new LocusIteratorOverride(MAX_READS); + ((LocusIteratorOverride) tracker).resetWarningCount(); + } + + @Test + public void testLocusOverflow() { + SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 100); + if (tracker.exceeded(rec, MAX_READS - 1)) + Assert.fail("We shouldn't be exceeded when MAX_READS -1 is the input"); + if (!tracker.exceeded(rec, MAX_READS)) Assert.fail("We should be exceeded when MAX_READS is the input"); + if (!tracker.exceeded(rec, MAX_READS + 1)) + Assert.fail("We shouldn't be exceeded when MAX_READS +1 is the input"); + } + + @Test + public void testContinuousLocus() { + for (int x = 1; x < 5; x++) { + SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 100); + tracker.exceeded(rec, MAX_READS + 1); + } + tracker.cleanWarningQueue(); + Assert.assertEquals(1, ((LocusIteratorOverride) tracker).getWarningCount()); + } + + @Test + public void testTwoSeperateContinuousLoci() { + for (int x = 1; x < 5; x++) { + SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 2); + tracker.exceeded(rec, MAX_READS + 1); + } + for (int x = 10; x < 15; x++) { + SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 2); + tracker.exceeded(rec, MAX_READS + 1); + } + tracker.cleanWarningQueue(); + Assert.assertEquals(2, ((LocusIteratorOverride) tracker).getWarningCount()); + } + + @Test + // make sure we get only the specified number of warnings + public void testOverflow() { + for (int x = 1; x < (LocusOverflowTracker.warningsEmitted * 3); x += 2) { + SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 100); + tracker.exceeded(rec, MAX_READS + 1); + } + tracker.cleanWarningQueue(); + Assert.assertEquals(LocusOverflowTracker.warningsEmitted, ((LocusIteratorOverride) tracker).getWarningCount()); + } + +} + + +class LocusIteratorOverride extends LocusOverflowTracker { + + public int warningsDropped = 0; + + public LocusIteratorOverride(int maxPileup) { + super(maxPileup); + } + + /** override to count warnings instead of generating output. */ + protected void warnUser() { + warningInQueue = false; + if (warningsEmitted <= MAX_WARNINGS) + warningsEmitted++; + else + warningsDropped++; + } + + public int getWarningCount() { + return warningsEmitted; + } + + public void resetWarningCount() { + LocusOverflowTracker.warningsEmitted = 0; + } +} \ No newline at end of file