From dfc36c1e9535aea7903f8314550793c4e134bec7 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 26 May 2010 22:12:25 +0000 Subject: [PATCH] Restructuring of the mandatory read filters for traversals. Now everything uses ReadFilters, even for the required filters like being mapped for LocusWalkers. Statistics now tracked for each read filter used during the traversal and info emitted in INFO at the end. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3445 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/datasources/providers/LocusView.java | 7 +- .../BlockDrivenSAMDataSource.java | 2 +- .../simpleDataSources/SAMDataSource.java | 4 +- .../gatk/executive/LinearMicroScheduler.java | 10 +- .../sting/gatk/executive/WindowMaker.java | 54 ++------ .../filters/CountingFilteringIterator.java | 127 ++++++++++++++++++ .../NotPrimaryAlignmentReadFilter.java | 43 ++++++ .../gatk/filters/TinyFragmentFilter.java | 84 ++++++++++++ .../gatk/filters/UnmappedReadFilter.java | 43 ++++++ .../gatk/traversals/TraversalEngine.java | 43 ++++-- .../gatk/traversals/TraversalStatistics.java | 17 +++ .../gatk/traversals/TraverseDuplicates.java | 35 +---- .../sting/gatk/walkers/DuplicateWalker.java | 23 ++++ .../sting/gatk/walkers/LocusWalker.java | 69 ++++++++++ .../sting/gatk/walkers/Walker.java | 11 ++ .../org/broadinstitute/sting/utils/Utils.java | 25 ++++ 16 files changed, 498 insertions(+), 99 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java create mode 100755 java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java create mode 100755 java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java create mode 100755 java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index f10fd3d02..886e4afab 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -1,20 +1,15 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import net.sf.picard.filter.FilteringIterator; -import net.sf.picard.filter.SamRecordFilter; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.iterators.LocusIterator; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.Arrays; import java.util.Collection; -import java.util.Iterator; import java.util.NoSuchElementException; + /** * User: hanna * Date: May 13, 2009 diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index 3d116e25a..5d010d788 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -255,7 +255,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { continue; CloseableIterator iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); if(shard.getFilter() != null) - iterator = new FilteringIterator(iterator,shard.getFilter()); + iterator = new FilteringIterator(iterator,shard.getFilter()); // not a counting iterator because we don't want to show the filtering of reads mergingIterator.addIterator(readers.getReader(id),iterator); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 32775dbc6..ca13ebedb 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -2,13 +2,13 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; -import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; @@ -209,7 +209,7 @@ public abstract class SAMDataSource implements SimpleDataSource { for( SamRecordFilter supplementalFilter: supplementalFilters ) wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(), - new FilteringIterator(wrappedIterator,supplementalFilter)); + new CountingFilteringIterator(wrappedIterator,supplementalFilter)); return wrappedIterator; } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 807f12b9a..0db4feb20 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -11,14 +11,14 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.iterators.LocusIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.util.Collection; -import net.sf.picard.filter.FilteringIterator; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; + /** A micro-scheduling manager for single-threaded execution of a traversal. */ public class LinearMicroScheduler extends MicroScheduler { @@ -57,7 +57,7 @@ public class LinearMicroScheduler extends MicroScheduler { for (Shard shard : shardStrategy) { // New experimental code for managing locus intervals. if(shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) { - WindowMaker windowMaker = new WindowMaker(getReadIterator(shard),shard.getGenomeLocs()); + WindowMaker windowMaker = new WindowMaker(getReadIterator(shard), shard.getGenomeLocs(), walker.getMandatoryReadFilters()); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),iterator.getLocus(),iterator,reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index d40078803..280bb5275 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,7 +4,9 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import java.util.*; @@ -53,22 +55,27 @@ public class WindowMaker implements Iterable, I */ private boolean shardGenerated = false; + public WindowMaker(StingSAMIterator iterator, List intervals) { + this(iterator, intervals, new ArrayList()); + } + /** * Create a new window maker with the given iterator as a data source, covering * the given intervals. * @param iterator The data source for this window. * @param intervals The set of intervals over which to traverse. */ - public WindowMaker(StingSAMIterator iterator, List intervals) { + public WindowMaker(StingSAMIterator iterator, List intervals, List filters) { this.sourceInfo = iterator.getSourceInfo(); this.readIterator = iterator; LocusIterator locusIterator; + Iterator wrappedIterator = TraversalEngine.addMandatoryFilteringIterators(iterator, filters); if(sourceInfo.getDownsamplingMethod() != null && (sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_BY_SAMPLE || sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR)) - locusIterator = new DownsamplingLocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); + locusIterator = new DownsamplingLocusIteratorByState(wrappedIterator,sourceInfo); else - locusIterator = new LocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); + locusIterator = new LocusIteratorByState(wrappedIterator,sourceInfo); this.locusOverflowTracker = locusIterator.getLocusOverflowTracker(); @@ -144,45 +151,4 @@ public class WindowMaker implements Iterable, I sourceIterator.next(); } } - - /** - * Class to filter out un-handle-able reads from the stream. We currently are skipping - * unmapped reads, non-primary reads, unaligned reads, and duplicate reads. - */ - private static class LocusStreamFilterFunc implements SamRecordFilter { - SAMRecord lastRead = null; - public boolean filterOut(SAMRecord rec) { - boolean result = false; - String why = ""; - if (rec.getReadUnmappedFlag()) { - TraversalStatistics.nUnmappedReads++; - result = true; - why = "Unmapped"; - } else if (rec.getNotPrimaryAlignmentFlag()) { - TraversalStatistics.nNotPrimary++; - result = true; - why = "Not Primary"; - } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { - TraversalStatistics.nBadAlignments++; - result = true; - why = "No alignment start"; - } else if (rec.getDuplicateReadFlag()) { - TraversalStatistics.nDuplicates++; - result = true; - why = "Duplicate reads"; - } - else { - result = false; - } - - if (result) { - TraversalStatistics.nSkippedReads++; - //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); - } else { - TraversalStatistics.nReads++; - } - return result; - } - } - } diff --git a/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java b/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java new file mode 100755 index 000000000..f69b0749f --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.filters; + +import net.sf.samtools.util.CloserUtil; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.filter.SamRecordFilter; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; + +/** + * Filtering Iterator which takes a filter and an iterator and iterates + * through only those records which are not rejected by the filter. + * + * Just a copy of a unmodifiable FilteringIterator from Picard + * + * @author Mark DePristo + */ +public class CountingFilteringIterator implements CloseableIterator { + private final Iterator iterator; + private final SamRecordFilter filter; + private SAMRecord next = null; + + /** + * Constructor + * + * @param iterator the backing iterator + * @param filter the filter (which may be a FilterAggregator) + */ + public CountingFilteringIterator(Iterator iterator, SamRecordFilter filter) { + this.iterator = iterator; + this.filter = filter; + next = getNextRecord(); + } + + /** + * Special case to count passing records + * @param iterator + */ + public CountingFilteringIterator(Iterator iterator) { + this(iterator, null); + } + + /** + * Returns true if the iteration has more elements. + * + * @return true if the iteration has more elements. Otherwise returns false. + */ + public boolean hasNext() { + return next != null; + } + + /** + * Returns the next element in the iteration. + * + * @return the next element in the iteration + * @throws java.util.NoSuchElementException + */ + public SAMRecord next() { + if (next == null) { + throw new NoSuchElementException("Iterator has no more elements."); + } + final SAMRecord result = next; + next = getNextRecord(); + return result; + } + + /** + * Required method for Iterator API. + * + * @throws UnsupportedOperationException + */ + public void remove() { + throw new UnsupportedOperationException("Remove() not supported by CountingFilteringIterator"); + } + + public void close() { + CloserUtil.close(iterator); + } + + /** + * Gets the next record from the underlying iterator that passes the filter + * + * @return SAMRecord the next filter-passing record + */ + private SAMRecord getNextRecord() { + while (iterator.hasNext()) { + SAMRecord record = iterator.next(); + + if ( filter == null ) { + TraversalStatistics.nReads++; + return record; + } else if (!filter.filterOut(record)) { + return record; + } else { + TraversalStatistics.incrementFilter(filter); + } + } + + return null; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java new file mode 100755 index 000000000..8837f4e28 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 9, 2009 + * + * Filter out duplicate reads. + */ + +public class NotPrimaryAlignmentReadFilter implements SamRecordFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getNotPrimaryAlignmentFlag(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java new file mode 100755 index 000000000..a0ed6cbf5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; + +/** + * Filters out read pairs where the reads are so long relative to the over fragment size that they are + * reading into each other's adaptors. + * + * Normally, fragments are sufficiently far apart that reads aren't reading into each other. + * + * |--------------------> first read + * <--------------------| second read + * + * Sometimes, mostly due to lab errors or constraints, fragment library are made too short relative to the + * length of the reads. For example, it's possible to have 76bp PE reads with 125 bp inserts, so that ~25 bp of each + * read overlaps with its mate. + * + * |--------------------> first read + * <--------------------| second read + * + * This filter deals with the situation where the fragment is so small that the each read actually reads into the + * adaptor sequence of its mate, generating mismatches at both ends of the read: + * + * |----------------XXXX> first read + * left read + * mateEnd; + //System.out.printf("%s %d %d %d %d %d => %b%n", rec.getReadName(), start, end, mateStart, mateEnd, isize, bad); + if ( bad ) { + //System.out.printf("TinyFragment: " + rec.format()); + return true; + } else { + return false; + } + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java new file mode 100755 index 000000000..80cbb7024 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 9, 2009 + * + * Filter out duplicate reads. + */ + +public class UnmappedReadFilter implements SamRecordFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getReadUnmappedFlag() || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index e31f7d3d5..b84f94d5b 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -3,7 +3,17 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Map; +import java.util.List; +import java.util.Iterator; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { // Time in milliseconds since we initialized this engine @@ -88,16 +98,22 @@ public abstract class TraversalEngine,Provide printProgress(true, type, null); final long curTime = System.currentTimeMillis(); final double elapsed = (curTime - startTime) / 1000.0; + + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( long counts : TraversalStatistics.counter.values() ) + nSkippedReads += counts; + logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600)); - logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)", - TraversalStatistics.nSkippedReads, + logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)", + nSkippedReads, TraversalStatistics.nReads, - (TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads)); - logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads)); - logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates)); - logger.info(String.format(" -> %d reads with non-primary alignments", TraversalStatistics.nNotPrimary)); - logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments)); - logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels)); + 100.0 * MathUtils.ratio(nSkippedReads, TraversalStatistics.nReads))); + for ( Map.Entry filterCounts : TraversalStatistics.counter.entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count, TraversalStatistics.nReads), Utils.getClassName(filterCounts.getKey()))); + } } /** Initialize the traversal engine. After this point traversals can be run over the data */ @@ -117,4 +133,15 @@ public abstract class TraversalEngine,Provide public abstract T traverse(WalkerType walker, ProviderType dataProvider, T sum); + + public static Iterator addMandatoryFilteringIterators(Iterator iter, List filters ) { + for( SamRecordFilter filter : filters ) { + //logger.debug("Adding filter " + filter.getClass()); + iter = new CountingFilteringIterator(iter,filter); + } + + return new CountingFilteringIterator(iter); // special case to count all reads + } + + } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java index ee8a3e684..c49bd2f19 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java @@ -1,5 +1,12 @@ package org.broadinstitute.sting.gatk.traversals; +import net.sf.picard.filter.SamRecordFilter; + +import java.util.Map; +import java.util.HashMap; + +import org.broadinstitute.sting.utils.Utils; + /** * Created by IntelliJ IDEA. * User: hanna @@ -20,11 +27,21 @@ public class TraversalStatistics { public static long nBadAlignments; public static long nSkippedIndels; public static long nDuplicates; + public static Map counter = new HashMap(); static { reset(); } + public static void incrementFilter(SamRecordFilter filter) { + long c = 0; + if ( counter.containsKey(filter.getClass()) ) { + c = counter.get(filter.getClass()); + } + + counter.put(filter.getClass(), c + 1L); + } + public static void reset() { nRecords = 0; nReads = 0; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 1075cb30a..4f16a3b3d 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -25,9 +25,9 @@ package org.broadinstitute.sting.gatk.traversals; -import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.providers.ReadView; @@ -148,37 +148,6 @@ public class TraverseDuplicates extends TraversalEngine %b", rec.getReadName(), result); - } else { - TraversalStatistics.nReads++; - } - return result; - } - } - // -------------------------------------------------------------------------------------------------------------- // // new style interface to the system @@ -196,7 +165,7 @@ public class TraverseDuplicates extends TraversalEngine walker, ReadShardDataProvider dataProvider, T sum) { - FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc()); + Iterator filterIter = addMandatoryFilteringIterators(new ReadView(dataProvider).iterator(), walker.getMandatoryReadFilters()); PushbackIterator iter = new PushbackIterator(filterIter); /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index 61dda61bd..d9a7a7121 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -1,12 +1,19 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.List; import java.util.Set; +import java.util.ArrayList; +import java.util.Arrays; import net.sf.samtools.SAMRecord; +import net.sf.picard.filter.SamRecordFilter; /** * Created by IntelliJ IDEA. @@ -27,4 +34,20 @@ public abstract class DuplicateWalker extends Walker getMandatoryReadFilters() { + SamRecordFilter filter1 = new UnmappedReadFilter(); + SamRecordFilter filter2 = new NotPrimaryAlignmentReadFilter(); + List x = super.getMandatoryReadFilters(); + + x.addAll(Arrays.asList(filter2, filter1)); + return x; + + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index f066da13e..d8d5adefb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -3,6 +3,15 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.Arrays; /** * Created by IntelliJ IDEA. @@ -21,4 +30,64 @@ public abstract class LocusWalker extends Walker getMandatoryReadFilters() { +// if ( false ) { +// SamRecordFilter filter = new LocusStreamFilterFunc(); +// return Arrays.asList(filter); +// } else { + SamRecordFilter filter1 = new UnmappedReadFilter(); + SamRecordFilter filter2 = new NotPrimaryAlignmentReadFilter(); + SamRecordFilter filter3 = new DuplicateReadFilter(); + List x = super.getMandatoryReadFilters(); + x.addAll(Arrays.asList(filter3, filter2, filter1)); +// } + return x; + } + + /** + * Class to filter out un-handle-able reads from the stream. We currently are skipping + * unmapped reads, non-primary reads, unaligned reads, and duplicate reads. + */ + private static class LocusStreamFilterFunc implements SamRecordFilter { + SAMRecord lastRead = null; + public boolean filterOut(SAMRecord rec) { + boolean result = false; + if (rec.getReadUnmappedFlag()) { + TraversalStatistics.nUnmappedReads++; + result = true; + //why = "Unmapped"; + } else if (rec.getNotPrimaryAlignmentFlag()) { + TraversalStatistics.nNotPrimary++; + result = true; +// why = "Not Primary"; + } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { + TraversalStatistics.nBadAlignments++; + result = true; +// why = "No alignment start"; + } else if (rec.getDuplicateReadFlag()) { + TraversalStatistics.nDuplicates++; + result = true; +// why = "Duplicate reads"; + } + else { + result = false; + } + + if (result) { + TraversalStatistics.nSkippedReads++; + //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); + } else { + TraversalStatistics.nReads++; + } + + return result; + } + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index e2650ecae..9c9b07c20 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -27,11 +27,13 @@ package org.broadinstitute.sting.gatk.walkers; import java.io.PrintStream; import java.util.List; +import java.util.ArrayList; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.apache.log4j.Logger; +import net.sf.picard.filter.SamRecordFilter; /** * Created by IntelliJ IDEA. @@ -133,6 +135,15 @@ public abstract class Walker { out.println("[REDUCE RESULT] Traversal result is: " + result); } + + /** + * Returns a list of SamRecordFilters that *must* be applied to the read stream for the traversal to work + * @return a list of SamRecordFilters to apply in order + */ + public List getMandatoryReadFilters() { + return new ArrayList(); // by default + } + /** * General interval reduce routine called after all of the traversals are done * @param results diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 9780acc9f..2f65f9cee 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -49,6 +49,31 @@ public class Utils { /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(Utils.class); + public static String getClassName(Class c) { + String FQClassName = c.getName(); + int firstChar; + firstChar = FQClassName.lastIndexOf ('.') + 1; + if ( firstChar > 0 ) { + FQClassName = FQClassName.substring ( firstChar ); + } + return FQClassName; + } + + + // returns package and class name + public static String getFullClassName(Class c) { + return c.getName(); + } + + // returns the package without the classname, empty string if + // there is no package + public static String getPackageName(Class c) { + String fullyQualifiedName = c.getName(); + int lastDot = fullyQualifiedName.lastIndexOf ('.'); + if (lastDot==-1){ return ""; } + return fullyQualifiedName.substring (0, lastDot); + } + public static void warnUser(final String msg) { logger.warn(String.format("********************************************************************************")); logger.warn(String.format("* WARNING:"));