From b57d4250bfae1ea8c6cdb5da8fca83e5affa01ea Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Thu, 9 Feb 2012 11:24:52 -0500 Subject: [PATCH] Documentation request by Eric. At each stage of the GATK where filtering occurs, added documentation suggesting the goal of the filtering along with examples of suggested inputs and outputs. --- .../gatk/datasources/providers/LocusView.java | 9 +- .../IntervalOverlapFilteringIterator.java | 203 ++++++++++++++++++ .../gatk/datasources/reads/SAMDataSource.java | 162 -------------- .../sting/gatk/executive/WindowMaker.java | 18 +- .../sting/gatk/traversals/TraverseLoci.java | 4 +- 5 files changed, 228 insertions(+), 168 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index f9ed0cb74..a3ce6dd27 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -25,9 +25,14 @@ import java.util.NoSuchElementException; */ /** - * A queue of locus context entries. + * The two goals of the LocusView are as follows: + * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch + * between iterating over all bases in a region, only covered bases in a region covered by + * reads, only bases in a region covered by RODs, or any other sort of trigger track + * implementation one can think of. + * 2) To manage the copious number of iterators that have to be jointly pulled through the + * genome to make a locus traversal function. */ - public abstract class LocusView extends LocusIterator implements View { /** * The locus bounding this view. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java new file mode 100644 index 000000000..4005f1c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.List; +import java.util.NoSuchElementException; + +/** + * High efficiency filtering iterator designed to filter out reads only included + * in the query results due to the granularity of the BAM index. + * + * Built into the BAM index is a notion of 16kbase granularity -- an index query for + * two regions contained within a 16kbase chunk (say, chr1:5-10 and chr1:11-20) will + * return exactly the same regions within the BAM file. This iterator is optimized + * to subtract out reads which do not at all overlap the interval list passed to the + * constructor. + * + * Example: + * interval list: chr20:6-10 + * Reads that would pass through the filter: chr20:6-10, chr20:1-15, chr20:1-7, chr20:8-15. + * Reads that would be discarded by the filter: chr20:1-5, chr20:11-15. + */ +class IntervalOverlapFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private CloseableIterator iterator; + + /** + * The next read, queued up and ready to go. + */ + private SAMRecord nextRead; + + /** + * Rather than using the straight genomic bounds, use filter out only mapped reads. + */ + private boolean keepOnlyUnmappedReads; + + /** + * Custom representation of interval bounds. + * Makes it simpler to track current position. + */ + private int[] intervalContigIndices; + private int[] intervalStarts; + private int[] intervalEnds; + + /** + * Position within the interval list. + */ + private int currentBound = 0; + + public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) { + this.iterator = iterator; + + // Look at the interval list to detect whether we should worry about unmapped reads. + // If we find a mix of mapped/unmapped intervals, throw an exception. + boolean foundMappedIntervals = false; + for(GenomeLoc location: intervals) { + if(! GenomeLoc.isUnmapped(location)) + foundMappedIntervals = true; + keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location); + } + + + if(foundMappedIntervals) { + if(keepOnlyUnmappedReads) + throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads"); + this.intervalContigIndices = new int[intervals.size()]; + this.intervalStarts = new int[intervals.size()]; + this.intervalEnds = new int[intervals.size()]; + int i = 0; + for(GenomeLoc interval: intervals) { + intervalContigIndices[i] = interval.getContigIndex(); + intervalStarts[i] = interval.getStart(); + intervalEnds[i] = interval.getStop(); + i++; + } + } + + advance(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if(nextRead == null) + throw new NoSuchElementException("No more reads left in this iterator."); + SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator"); + } + + + public void close() { + iterator.close(); + } + + private void advance() { + nextRead = null; + + if(!iterator.hasNext()) + return; + + SAMRecord candidateRead = iterator.next(); + while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { + if(!keepOnlyUnmappedReads) { + // Mapped read filter; check against GenomeLoc-derived bounds. + if(readEndsOnOrAfterStartingBound(candidateRead)) { + // This read ends after the current interval begins. + // Promising, but this read must be checked against the ending bound. + if(readStartsOnOrBeforeEndingBound(candidateRead)) { + // Yes, this read is within both bounds. This must be our next read. + nextRead = candidateRead; + break; + } + else { + // Oops, we're past the end bound. Increment the current bound and try again. + currentBound++; + continue; + } + } + } + else { + // Found an unmapped read. We're done. + if(candidateRead.getReadUnmappedFlag()) { + nextRead = candidateRead; + break; + } + } + + // No more reads available. Stop the search. + if(!iterator.hasNext()) + break; + + // No reasonable read found; advance the iterator. + candidateRead = iterator.next(); + } + } + + /** + * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its + * end will be distorted, so rely only on the alignment start. + * @param read The read to position-check. + * @return True if the read starts after the current bounds. False otherwise. + */ + private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { + return + // Read ends on a later contig, or... + read.getReferenceIndex() > intervalContigIndices[currentBound] || + // Read ends of this contig... + (read.getReferenceIndex() == intervalContigIndices[currentBound] && + // either after this location, or... + (read.getAlignmentEnd() >= intervalStarts[currentBound] || + // read is unmapped but positioned and alignment start is on or after this start point. + (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); + } + + /** + * Check whether the read lies before the end of the current bound. + * @param read The read to position-check. + * @return True if the read starts after the current bounds. False otherwise. + */ + private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { + return + // Read starts on a prior contig, or... + read.getReferenceIndex() < intervalContigIndices[currentBound] || + // Read starts on this contig and the alignment start is registered before this end point. + (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index a4681cffd..27b9e7f77 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.SimpleTimer; @@ -976,167 +975,6 @@ public class SAMDataSource { */ private class ReadGroupMapping extends HashMap {} - /** - * Filters out reads that do not overlap the current GenomeLoc. - * Note the custom implementation: BAM index querying returns all reads that could - * possibly overlap the given region (and quite a few extras). In order not to drag - * down performance, this implementation is highly customized to its task. - */ - private class IntervalOverlapFilteringIterator implements CloseableIterator { - /** - * The wrapped iterator. - */ - private CloseableIterator iterator; - - /** - * The next read, queued up and ready to go. - */ - private SAMRecord nextRead; - - /** - * Rather than using the straight genomic bounds, use filter out only mapped reads. - */ - private boolean keepOnlyUnmappedReads; - - /** - * Custom representation of interval bounds. - * Makes it simpler to track current position. - */ - private int[] intervalContigIndices; - private int[] intervalStarts; - private int[] intervalEnds; - - /** - * Position within the interval list. - */ - private int currentBound = 0; - - public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) { - this.iterator = iterator; - - // Look at the interval list to detect whether we should worry about unmapped reads. - // If we find a mix of mapped/unmapped intervals, throw an exception. - boolean foundMappedIntervals = false; - for(GenomeLoc location: intervals) { - if(! GenomeLoc.isUnmapped(location)) - foundMappedIntervals = true; - keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location); - } - - - if(foundMappedIntervals) { - if(keepOnlyUnmappedReads) - throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads"); - this.intervalContigIndices = new int[intervals.size()]; - this.intervalStarts = new int[intervals.size()]; - this.intervalEnds = new int[intervals.size()]; - int i = 0; - for(GenomeLoc interval: intervals) { - intervalContigIndices[i] = interval.getContigIndex(); - intervalStarts[i] = interval.getStart(); - intervalEnds[i] = interval.getStop(); - i++; - } - } - - advance(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if(nextRead == null) - throw new NoSuchElementException("No more reads left in this iterator."); - SAMRecord currentRead = nextRead; - advance(); - return currentRead; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator"); - } - - - public void close() { - iterator.close(); - } - - private void advance() { - nextRead = null; - - if(!iterator.hasNext()) - return; - - SAMRecord candidateRead = iterator.next(); - while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { - if(!keepOnlyUnmappedReads) { - // Mapped read filter; check against GenomeLoc-derived bounds. - if(readEndsOnOrAfterStartingBound(candidateRead)) { - // This read ends after the current interval begins. - // Promising, but this read must be checked against the ending bound. - if(readStartsOnOrBeforeEndingBound(candidateRead)) { - // Yes, this read is within both bounds. This must be our next read. - nextRead = candidateRead; - break; - } - else { - // Oops, we're past the end bound. Increment the current bound and try again. - currentBound++; - continue; - } - } - } - else { - // Found an unmapped read. We're done. - if(candidateRead.getReadUnmappedFlag()) { - nextRead = candidateRead; - break; - } - } - - // No more reads available. Stop the search. - if(!iterator.hasNext()) - break; - - // No reasonable read found; advance the iterator. - candidateRead = iterator.next(); - } - } - - /** - * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its - * end will be distorted, so rely only on the alignment start. - * @param read The read to position-check. - * @return True if the read starts after the current bounds. False otherwise. - */ - private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { - return - // Read ends on a later contig, or... - read.getReferenceIndex() > intervalContigIndices[currentBound] || - // Read ends of this contig... - (read.getReferenceIndex() == intervalContigIndices[currentBound] && - // either after this location, or... - (read.getAlignmentEnd() >= intervalStarts[currentBound] || - // read is unmapped but positioned and alignment start is on or after this start point. - (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); - } - - /** - * Check whether the read lies before the end of the current bound. - * @param read The read to position-check. - * @return True if the read starts after the current bounds. False otherwise. - */ - private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { - return - // Read starts on a prior contig, or... - read.getReferenceIndex() < intervalContigIndices[currentBound] || - // Read starts on this contig and the alignment start is registered before this end point. - (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); - } - } - /** * Locates the index file alongside the given BAM, if present. * TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent. diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index d1f5d80da..da11d36dd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -17,9 +17,21 @@ import java.util.List; import java.util.NoSuchElementException; /** - * Buffer shards of data which may or may not contain multiple loci into - * iterators of all data which cover an interval. Its existence is an homage - * to Mark's stillborn WindowMaker, RIP 2009. + * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci + * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp + * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of + * loci to only those covered by the given interval list. + * + * Example: + * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10 + * Incoming intervals: chr20:3-7 + * + * Locus iterator by state will produce the following stream of data: + * chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, + * chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E} + * + * WindowMakerIterator will then filter the incoming stream, emitting the following stream: + * chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E} * * @author mhanna * @version 0.1 diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java index d99e7c353..1d14a7f35 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java @@ -102,7 +102,9 @@ public class TraverseLoci extends TraversalEngine,Locu } /** - * Gets the best view of loci for this walker given the available data. + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. * @param walker walker to interrogate. * @param dataProvider Data which which to drive the locus view. * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.