changes to intelligently log overflowing locus pile-ups.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2640 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-01-20 08:09:48 +00:00
parent 4ac9eb7cb2
commit a1b4cc4baf
4 changed files with 334 additions and 26 deletions

View File

@ -32,7 +32,7 @@ public class Reads {
private Integer downsampleToCoverage = null;
private ValidationExclusion exclusionList = null;
private Collection<SamRecordFilter> supplementalFilters = null;
private int maximumReadsAtLocus = Integer.MAX_VALUE; // this should always be set, so we'll default it MAX_INT
protected int maximumReadsAtLocus = Integer.MAX_VALUE; // this should always be set, so we'll default it MAX_INT
private boolean includeReadsWithDeletionAtLoci = false;
private boolean generateExtendedEvents = false; // do we want to generate additional piles of "extended" events (indels)
// immediately after the reference base such event is associated with?

View File

@ -37,14 +37,16 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import java.util.*;
/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
*/
/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */
public class LocusIteratorByState extends LocusIterator {
/**
* our log, which we want to capture anything from this class
* the overflow tracker, which makes sure we get a limited number of warnings for locus pile-ups that
* exceed the max depth
*/
private LocusOverflowTracker overflowTracker;
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
// -----------------------------------------------------------------------------------------------------------------
@ -77,16 +79,16 @@ public class LocusIteratorByState extends LocusIterator {
// events immediately preceding the current reference base).
boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
// the only purpose of this flag is to shield away a few additional lines of code
// when extended piles are not needed, it may not be even worth it...
// the only purpose of this flag is to shield away a few additional lines of code
// when extended piles are not needed, it may not be even worth it...
byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
// current base on the ref. We use a counter-like variable here since clearing the indel event is
// delayed by one base, so we need to remember how long ago we have seen the actual event
// current base on the ref. We use a counter-like variable here since clearing the indel event is
// delayed by one base, so we need to remember how long ago we have seen the actual event
int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
// we cache it here mainly for convenience
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
// we cache it here mainly for convenience
public SAMRecordState(SAMRecord read, boolean extended) {
@ -257,6 +259,7 @@ public class LocusIteratorByState extends LocusIterator {
public LocusIteratorByState(final Iterator<SAMRecord> samIterator, Reads readInformation) {
this.it = new PushbackIterator<SAMRecord>(samIterator);
this.readInfo = readInformation;
overflowTracker = new LocusOverflowTracker(readInformation.getMaxReadsAtLocus());
}
public Iterator<AlignmentContext> iterator() {
@ -270,6 +273,9 @@ public class LocusIteratorByState extends LocusIterator {
public boolean hasNext() {
boolean r = ! readStates.isEmpty() || it.hasNext();
//if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
// if we don't have a next record, make sure we clean the warning queue
if (!r) overflowTracker.cleanWarningQueue();
return r;
}
@ -437,39 +443,30 @@ public class LocusIteratorByState extends LocusIterator {
// }
// }
private void collectPendingReads(final int maximumPileupSize) {
private void collectPendingReads(int maxReads) {
//if (DEBUG) {
// logger.debug(String.format("entering collectPendingReads..., hasNext=%b", it.hasNext()));
// printState();
//}
Boolean warned = false; // warn them once per locus
int curSize = readStates.size(); // simple performance improvement -- avoids unnecessary size() operation
while (it.hasNext()) {
SAMRecord read = it.next();
// if (DEBUG) logger.debug(String.format("Considering read %s: %s vs. %s",
// read.getReadName(), getLocation(), GenomeLocParser.createGenomeLoc(read)));
if ( readIsPastCurrentPosition(read) ) {
// We've collected up enough reads
if (readIsPastCurrentPosition(read)) {
// We've collected up all the reads that span this region
it.pushback(read);
break;
} else {
if ( curSize >= maximumPileupSize ) {
if (!warned) {
warned = true;
Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maximumPileupSize + " at location " + getLocation());
}
} else {
if (curSize < maxReads) {
SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents());
state.stepForwardOnGenome();
readStates.add(state);
curSize++;
if ( state.hadIndel() ) hasExtendedEvents = true;
if (state.hadIndel()) hasExtendedEvents = true;
//if (DEBUG) logger.debug(String.format(" ... added read %s", read.getReadName()));
}
}
overflowTracker.exceeded(read, curSize);
}
}
@ -513,4 +510,99 @@ public class LocusIteratorByState extends LocusIterator {
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
/**
* a method for setting the overflow tracker, for dependency injection
* @param tracker
*/
protected void setLocusOverflowTracker(LocusOverflowTracker tracker) {
this.overflowTracker = tracker;
}
/**
* a method for getting the overflow tracker, for testing
* @return
*/
protected LocusOverflowTracker getLocusOverflowTracker() {
return this.overflowTracker;
}
}
/**
* a helper class that organizes the output of warning messages from read pile-ups that
* are greater than the max pile-up size. We only want a single warning from each non-contigous
* interval, up until the maximum warning limit.
*/
class LocusOverflowTracker {
// the last location we found a warning at
protected GenomeLoc lastLocation = null;
// the maximum warning count, and the number of warnings emitted
protected static int warningsEmitted = 0;
public static final int MAX_WARNINGS = 100;
// our maximum pileup size
protected final int maxPileupSize;
// do we have a pending warning?
protected boolean warningInQueue = false;
/**
* create a LocusOverflowTracker
*
* @param maxPileup the maximum allowed pile-up size
*/
public LocusOverflowTracker(int maxPileup) {
warningInQueue = false;
maxPileupSize = maxPileup;
}
/**
* have we exceeded the maximum pile-up size?
*
* @param read the current read, we use this to get the GenomeLoc only if needed (for efficiency)
* @param pileupSize the pile-up size
*
* @return return true if we're greater, false if we're not
*/
public boolean exceeded(SAMRecord read, int pileupSize) {
boolean exceeded = pileupSize >= maxPileupSize;
if (exceeded) {
warningInQueue = true;
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
if (lastLocation == null) lastLocation = loc;
else if (lastLocation.contiguousP(loc))
lastLocation = lastLocation.merge(loc);
else {
warnUser();
lastLocation = loc;
}
} else if (warningInQueue) {
lastLocation = null;
warnUser();
}
return exceeded;
}
/**
* clean up the warning queue, making sure we haven't stored a warning
* that hasn't been emitted yet.
*/
public void cleanWarningQueue() {
if (warningInQueue) warnUser();
}
/** warn the user, checking to make sure we haven't exceeded the maximum warning level. */
protected void warnUser() {
if (!warningInQueue) throw new IllegalStateException("Without a warning in the queue, we shouldn't see a call to warnUser()");
warningInQueue = false;
if (warningsEmitted < MAX_WARNINGS) {
warningsEmitted++;
Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maxPileupSize + " at location " + lastLocation);
} else if (warningsEmitted == MAX_WARNINGS) {
warningsEmitted++;
Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maxPileupSize + " at location " + lastLocation +
"; the maximum warning count has been reached, we will no longer emit warnings of this nature!!");
}
}
}

View File

@ -0,0 +1,104 @@
package org.broadinstitute.sting.gatk.iterators;
import junit.framework.Assert;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* testing of the LocusIteratorByState
*/
public class LocusIteratorByStateTest extends BaseTest {
private final int MAX_READS = 10;
private static SAMFileHeader header;
private LocusIteratorByState li;
@BeforeClass
public static void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
}
@Test
public void testBasicWarnings() {
// create a bunch of fake records
List<SAMRecord> records = new ArrayList<SAMRecord>();
for (int x = 1; x < 50; x++)
records.add(ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 20));
// create a test version of the Reads object
TestReads reads = new TestReads(new ArrayList<File>());
reads.setMaxPileupSize(MAX_READS);
// create the iterator by state with the fake reads and fake records
li = new LocusIteratorByState(records.iterator(), reads);
// inject the testing version of the locus iterator watcher
li.setLocusOverflowTracker(new LocusIteratorOverride(MAX_READS));
((LocusIteratorOverride)li.getLocusOverflowTracker()).resetWarningCount();
while (li.hasNext()) {
AlignmentContext context = li.next();
//System.err.println(context.getLocation() + " " + context.getPileup().size());
}
Assert.assertEquals(1, ((LocusIteratorOverride) li.getLocusOverflowTracker()).getWarningCount());
}
@Test
public void testTwoBigPiles() {
// create a bunch of fake records
List<SAMRecord> records = new ArrayList<SAMRecord>();
for (int x = 1; x < 50; x++)
records.add(ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20));
for (int x = 1; x < 50; x++)
records.add(ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 100, 20));
// create a test version of the Reads object
TestReads reads = new TestReads(new ArrayList<File>());
reads.setMaxPileupSize(MAX_READS);
// create the iterator by state with the fake reads and fake records
li = new LocusIteratorByState(records.iterator(), reads);
// inject the testing version of the locus iterator watcher
li.setLocusOverflowTracker(new LocusIteratorOverride(MAX_READS));
((LocusIteratorOverride)li.getLocusOverflowTracker()).resetWarningCount();
while (li.hasNext()) {
AlignmentContext context = li.next();
//System.err.println(context.getLocation() + " " + context.getPileup().size());
}
Assert.assertEquals(2, ((LocusIteratorOverride) li.getLocusOverflowTracker()).getWarningCount());
}
}
class TestReads extends Reads {
/**
* Simple constructor for unit testing.
*
* @param readsFiles List of reads files to open.
*/
public TestReads(List<File> readsFiles) {
super(readsFiles);
}
public void setMaxPileupSize(int maxSize) {
this.maximumReadsAtLocus = maxSize;
}
}

View File

@ -0,0 +1,112 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* @author aaron
* <p/>
* Class LocusOverflowTrackerTest
* <p/>
* test out the locus overflow tracker
*/
public class LocusOverflowTrackerTest extends BaseTest {
private LocusOverflowTracker tracker;
private final int MAX_READS = 10;
private static SAMFileHeader header;
@BeforeClass
public static void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
}
@Before
public void beforeTest() {
tracker = new LocusIteratorOverride(MAX_READS);
((LocusIteratorOverride) tracker).resetWarningCount();
}
@Test
public void testLocusOverflow() {
SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 100);
if (tracker.exceeded(rec, MAX_READS - 1))
Assert.fail("We shouldn't be exceeded when MAX_READS -1 is the input");
if (!tracker.exceeded(rec, MAX_READS)) Assert.fail("We should be exceeded when MAX_READS is the input");
if (!tracker.exceeded(rec, MAX_READS + 1))
Assert.fail("We shouldn't be exceeded when MAX_READS +1 is the input");
}
@Test
public void testContinuousLocus() {
for (int x = 1; x < 5; x++) {
SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 100);
tracker.exceeded(rec, MAX_READS + 1);
}
tracker.cleanWarningQueue();
Assert.assertEquals(1, ((LocusIteratorOverride) tracker).getWarningCount());
}
@Test
public void testTwoSeperateContinuousLoci() {
for (int x = 1; x < 5; x++) {
SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 2);
tracker.exceeded(rec, MAX_READS + 1);
}
for (int x = 10; x < 15; x++) {
SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 2);
tracker.exceeded(rec, MAX_READS + 1);
}
tracker.cleanWarningQueue();
Assert.assertEquals(2, ((LocusIteratorOverride) tracker).getWarningCount());
}
@Test
// make sure we get only the specified number of warnings
public void testOverflow() {
for (int x = 1; x < (LocusOverflowTracker.warningsEmitted * 3); x += 2) {
SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, x, 100);
tracker.exceeded(rec, MAX_READS + 1);
}
tracker.cleanWarningQueue();
Assert.assertEquals(LocusOverflowTracker.warningsEmitted, ((LocusIteratorOverride) tracker).getWarningCount());
}
}
class LocusIteratorOverride extends LocusOverflowTracker {
public int warningsDropped = 0;
public LocusIteratorOverride(int maxPileup) {
super(maxPileup);
}
/** override to count warnings instead of generating output. */
protected void warnUser() {
warningInQueue = false;
if (warningsEmitted <= MAX_WARNINGS)
warningsEmitted++;
else
warningsDropped++;
}
public int getWarningCount() {
return warningsEmitted;
}
public void resetWarningCount() {
LocusOverflowTracker.warningsEmitted = 0;
}
}