diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index f10fd3d02..886e4afab 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -1,20 +1,15 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import net.sf.picard.filter.FilteringIterator; -import net.sf.picard.filter.SamRecordFilter; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.iterators.LocusIterator; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.Arrays; import java.util.Collection; -import java.util.Iterator; import java.util.NoSuchElementException; + /** * User: hanna * Date: May 13, 2009 diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index 3d116e25a..5d010d788 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -255,7 +255,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { continue; CloseableIterator iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); if(shard.getFilter() != null) - iterator = new FilteringIterator(iterator,shard.getFilter()); + iterator = new FilteringIterator(iterator,shard.getFilter()); // not a counting iterator because we don't want to show the filtering of reads mergingIterator.addIterator(readers.getReader(id),iterator); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 32775dbc6..ca13ebedb 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -2,13 +2,13 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; -import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; @@ -209,7 +209,7 @@ public abstract class SAMDataSource implements SimpleDataSource { for( SamRecordFilter supplementalFilter: supplementalFilters ) wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(), - new FilteringIterator(wrappedIterator,supplementalFilter)); + new CountingFilteringIterator(wrappedIterator,supplementalFilter)); return wrappedIterator; } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 807f12b9a..0db4feb20 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -11,14 +11,14 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.iterators.LocusIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.util.Collection; -import net.sf.picard.filter.FilteringIterator; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; + /** A micro-scheduling manager for single-threaded execution of a traversal. */ public class LinearMicroScheduler extends MicroScheduler { @@ -57,7 +57,7 @@ public class LinearMicroScheduler extends MicroScheduler { for (Shard shard : shardStrategy) { // New experimental code for managing locus intervals. if(shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) { - WindowMaker windowMaker = new WindowMaker(getReadIterator(shard),shard.getGenomeLocs()); + WindowMaker windowMaker = new WindowMaker(getReadIterator(shard), shard.getGenomeLocs(), walker.getMandatoryReadFilters()); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),iterator.getLocus(),iterator,reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index d40078803..280bb5275 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,7 +4,9 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import java.util.*; @@ -53,22 +55,27 @@ public class WindowMaker implements Iterable, I */ private boolean shardGenerated = false; + public WindowMaker(StingSAMIterator iterator, List intervals) { + this(iterator, intervals, new ArrayList()); + } + /** * Create a new window maker with the given iterator as a data source, covering * the given intervals. * @param iterator The data source for this window. * @param intervals The set of intervals over which to traverse. */ - public WindowMaker(StingSAMIterator iterator, List intervals) { + public WindowMaker(StingSAMIterator iterator, List intervals, List filters) { this.sourceInfo = iterator.getSourceInfo(); this.readIterator = iterator; LocusIterator locusIterator; + Iterator wrappedIterator = TraversalEngine.addMandatoryFilteringIterators(iterator, filters); if(sourceInfo.getDownsamplingMethod() != null && (sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_BY_SAMPLE || sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR)) - locusIterator = new DownsamplingLocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); + locusIterator = new DownsamplingLocusIteratorByState(wrappedIterator,sourceInfo); else - locusIterator = new LocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); + locusIterator = new LocusIteratorByState(wrappedIterator,sourceInfo); this.locusOverflowTracker = locusIterator.getLocusOverflowTracker(); @@ -144,45 +151,4 @@ public class WindowMaker implements Iterable, I sourceIterator.next(); } } - - /** - * Class to filter out un-handle-able reads from the stream. We currently are skipping - * unmapped reads, non-primary reads, unaligned reads, and duplicate reads. - */ - private static class LocusStreamFilterFunc implements SamRecordFilter { - SAMRecord lastRead = null; - public boolean filterOut(SAMRecord rec) { - boolean result = false; - String why = ""; - if (rec.getReadUnmappedFlag()) { - TraversalStatistics.nUnmappedReads++; - result = true; - why = "Unmapped"; - } else if (rec.getNotPrimaryAlignmentFlag()) { - TraversalStatistics.nNotPrimary++; - result = true; - why = "Not Primary"; - } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { - TraversalStatistics.nBadAlignments++; - result = true; - why = "No alignment start"; - } else if (rec.getDuplicateReadFlag()) { - TraversalStatistics.nDuplicates++; - result = true; - why = "Duplicate reads"; - } - else { - result = false; - } - - if (result) { - TraversalStatistics.nSkippedReads++; - //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); - } else { - TraversalStatistics.nReads++; - } - return result; - } - } - } diff --git a/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java b/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java new file mode 100755 index 000000000..f69b0749f --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.filters; + +import net.sf.samtools.util.CloserUtil; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.filter.SamRecordFilter; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; + +/** + * Filtering Iterator which takes a filter and an iterator and iterates + * through only those records which are not rejected by the filter. + * + * Just a copy of a unmodifiable FilteringIterator from Picard + * + * @author Mark DePristo + */ +public class CountingFilteringIterator implements CloseableIterator { + private final Iterator iterator; + private final SamRecordFilter filter; + private SAMRecord next = null; + + /** + * Constructor + * + * @param iterator the backing iterator + * @param filter the filter (which may be a FilterAggregator) + */ + public CountingFilteringIterator(Iterator iterator, SamRecordFilter filter) { + this.iterator = iterator; + this.filter = filter; + next = getNextRecord(); + } + + /** + * Special case to count passing records + * @param iterator + */ + public CountingFilteringIterator(Iterator iterator) { + this(iterator, null); + } + + /** + * Returns true if the iteration has more elements. + * + * @return true if the iteration has more elements. Otherwise returns false. + */ + public boolean hasNext() { + return next != null; + } + + /** + * Returns the next element in the iteration. + * + * @return the next element in the iteration + * @throws java.util.NoSuchElementException + */ + public SAMRecord next() { + if (next == null) { + throw new NoSuchElementException("Iterator has no more elements."); + } + final SAMRecord result = next; + next = getNextRecord(); + return result; + } + + /** + * Required method for Iterator API. + * + * @throws UnsupportedOperationException + */ + public void remove() { + throw new UnsupportedOperationException("Remove() not supported by CountingFilteringIterator"); + } + + public void close() { + CloserUtil.close(iterator); + } + + /** + * Gets the next record from the underlying iterator that passes the filter + * + * @return SAMRecord the next filter-passing record + */ + private SAMRecord getNextRecord() { + while (iterator.hasNext()) { + SAMRecord record = iterator.next(); + + if ( filter == null ) { + TraversalStatistics.nReads++; + return record; + } else if (!filter.filterOut(record)) { + return record; + } else { + TraversalStatistics.incrementFilter(filter); + } + } + + return null; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java new file mode 100755 index 000000000..8837f4e28 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 9, 2009 + * + * Filter out duplicate reads. + */ + +public class NotPrimaryAlignmentReadFilter implements SamRecordFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getNotPrimaryAlignmentFlag(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java new file mode 100755 index 000000000..a0ed6cbf5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; + +/** + * Filters out read pairs where the reads are so long relative to the over fragment size that they are + * reading into each other's adaptors. + * + * Normally, fragments are sufficiently far apart that reads aren't reading into each other. + * + * |--------------------> first read + * <--------------------| second read + * + * Sometimes, mostly due to lab errors or constraints, fragment library are made too short relative to the + * length of the reads. For example, it's possible to have 76bp PE reads with 125 bp inserts, so that ~25 bp of each + * read overlaps with its mate. + * + * |--------------------> first read + * <--------------------| second read + * + * This filter deals with the situation where the fragment is so small that the each read actually reads into the + * adaptor sequence of its mate, generating mismatches at both ends of the read: + * + * |----------------XXXX> first read + * left read + * mateEnd; + //System.out.printf("%s %d %d %d %d %d => %b%n", rec.getReadName(), start, end, mateStart, mateEnd, isize, bad); + if ( bad ) { + //System.out.printf("TinyFragment: " + rec.format()); + return true; + } else { + return false; + } + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java new file mode 100755 index 000000000..80cbb7024 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 9, 2009 + * + * Filter out duplicate reads. + */ + +public class UnmappedReadFilter implements SamRecordFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getReadUnmappedFlag() || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index e31f7d3d5..b84f94d5b 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -3,7 +3,17 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Map; +import java.util.List; +import java.util.Iterator; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { // Time in milliseconds since we initialized this engine @@ -88,16 +98,22 @@ public abstract class TraversalEngine,Provide printProgress(true, type, null); final long curTime = System.currentTimeMillis(); final double elapsed = (curTime - startTime) / 1000.0; + + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( long counts : TraversalStatistics.counter.values() ) + nSkippedReads += counts; + logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600)); - logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)", - TraversalStatistics.nSkippedReads, + logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)", + nSkippedReads, TraversalStatistics.nReads, - (TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads)); - logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads)); - logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates)); - logger.info(String.format(" -> %d reads with non-primary alignments", TraversalStatistics.nNotPrimary)); - logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments)); - logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels)); + 100.0 * MathUtils.ratio(nSkippedReads, TraversalStatistics.nReads))); + for ( Map.Entry filterCounts : TraversalStatistics.counter.entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count, TraversalStatistics.nReads), Utils.getClassName(filterCounts.getKey()))); + } } /** Initialize the traversal engine. After this point traversals can be run over the data */ @@ -117,4 +133,15 @@ public abstract class TraversalEngine,Provide public abstract T traverse(WalkerType walker, ProviderType dataProvider, T sum); + + public static Iterator addMandatoryFilteringIterators(Iterator iter, List filters ) { + for( SamRecordFilter filter : filters ) { + //logger.debug("Adding filter " + filter.getClass()); + iter = new CountingFilteringIterator(iter,filter); + } + + return new CountingFilteringIterator(iter); // special case to count all reads + } + + } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java index ee8a3e684..c49bd2f19 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java @@ -1,5 +1,12 @@ package org.broadinstitute.sting.gatk.traversals; +import net.sf.picard.filter.SamRecordFilter; + +import java.util.Map; +import java.util.HashMap; + +import org.broadinstitute.sting.utils.Utils; + /** * Created by IntelliJ IDEA. * User: hanna @@ -20,11 +27,21 @@ public class TraversalStatistics { public static long nBadAlignments; public static long nSkippedIndels; public static long nDuplicates; + public static Map counter = new HashMap(); static { reset(); } + public static void incrementFilter(SamRecordFilter filter) { + long c = 0; + if ( counter.containsKey(filter.getClass()) ) { + c = counter.get(filter.getClass()); + } + + counter.put(filter.getClass(), c + 1L); + } + public static void reset() { nRecords = 0; nReads = 0; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 1075cb30a..4f16a3b3d 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -25,9 +25,9 @@ package org.broadinstitute.sting.gatk.traversals; -import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.providers.ReadView; @@ -148,37 +148,6 @@ public class TraverseDuplicates extends TraversalEngine %b", rec.getReadName(), result); - } else { - TraversalStatistics.nReads++; - } - return result; - } - } - // -------------------------------------------------------------------------------------------------------------- // // new style interface to the system @@ -196,7 +165,7 @@ public class TraverseDuplicates extends TraversalEngine walker, ReadShardDataProvider dataProvider, T sum) { - FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc()); + Iterator filterIter = addMandatoryFilteringIterators(new ReadView(dataProvider).iterator(), walker.getMandatoryReadFilters()); PushbackIterator iter = new PushbackIterator(filterIter); /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index 61dda61bd..d9a7a7121 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -1,12 +1,19 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.List; import java.util.Set; +import java.util.ArrayList; +import java.util.Arrays; import net.sf.samtools.SAMRecord; +import net.sf.picard.filter.SamRecordFilter; /** * Created by IntelliJ IDEA. @@ -27,4 +34,20 @@ public abstract class DuplicateWalker extends Walker getMandatoryReadFilters() { + SamRecordFilter filter1 = new UnmappedReadFilter(); + SamRecordFilter filter2 = new NotPrimaryAlignmentReadFilter(); + List x = super.getMandatoryReadFilters(); + + x.addAll(Arrays.asList(filter2, filter1)); + return x; + + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index f066da13e..d8d5adefb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -3,6 +3,15 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.Arrays; /** * Created by IntelliJ IDEA. @@ -21,4 +30,64 @@ public abstract class LocusWalker extends Walker getMandatoryReadFilters() { +// if ( false ) { +// SamRecordFilter filter = new LocusStreamFilterFunc(); +// return Arrays.asList(filter); +// } else { + SamRecordFilter filter1 = new UnmappedReadFilter(); + SamRecordFilter filter2 = new NotPrimaryAlignmentReadFilter(); + SamRecordFilter filter3 = new DuplicateReadFilter(); + List x = super.getMandatoryReadFilters(); + x.addAll(Arrays.asList(filter3, filter2, filter1)); +// } + return x; + } + + /** + * Class to filter out un-handle-able reads from the stream. We currently are skipping + * unmapped reads, non-primary reads, unaligned reads, and duplicate reads. + */ + private static class LocusStreamFilterFunc implements SamRecordFilter { + SAMRecord lastRead = null; + public boolean filterOut(SAMRecord rec) { + boolean result = false; + if (rec.getReadUnmappedFlag()) { + TraversalStatistics.nUnmappedReads++; + result = true; + //why = "Unmapped"; + } else if (rec.getNotPrimaryAlignmentFlag()) { + TraversalStatistics.nNotPrimary++; + result = true; +// why = "Not Primary"; + } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { + TraversalStatistics.nBadAlignments++; + result = true; +// why = "No alignment start"; + } else if (rec.getDuplicateReadFlag()) { + TraversalStatistics.nDuplicates++; + result = true; +// why = "Duplicate reads"; + } + else { + result = false; + } + + if (result) { + TraversalStatistics.nSkippedReads++; + //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); + } else { + TraversalStatistics.nReads++; + } + + return result; + } + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index e2650ecae..9c9b07c20 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -27,11 +27,13 @@ package org.broadinstitute.sting.gatk.walkers; import java.io.PrintStream; import java.util.List; +import java.util.ArrayList; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.apache.log4j.Logger; +import net.sf.picard.filter.SamRecordFilter; /** * Created by IntelliJ IDEA. @@ -133,6 +135,15 @@ public abstract class Walker { out.println("[REDUCE RESULT] Traversal result is: " + result); } + + /** + * Returns a list of SamRecordFilters that *must* be applied to the read stream for the traversal to work + * @return a list of SamRecordFilters to apply in order + */ + public List getMandatoryReadFilters() { + return new ArrayList(); // by default + } + /** * General interval reduce routine called after all of the traversals are done * @param results diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 9780acc9f..2f65f9cee 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -49,6 +49,31 @@ public class Utils { /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(Utils.class); + public static String getClassName(Class c) { + String FQClassName = c.getName(); + int firstChar; + firstChar = FQClassName.lastIndexOf ('.') + 1; + if ( firstChar > 0 ) { + FQClassName = FQClassName.substring ( firstChar ); + } + return FQClassName; + } + + + // returns package and class name + public static String getFullClassName(Class c) { + return c.getName(); + } + + // returns the package without the classname, empty string if + // there is no package + public static String getPackageName(Class c) { + String fullyQualifiedName = c.getName(); + int lastDot = fullyQualifiedName.lastIndexOf ('.'); + if (lastDot==-1){ return ""; } + return fullyQualifiedName.substring (0, lastDot); + } + public static void warnUser(final String msg) { logger.warn(String.format("********************************************************************************")); logger.warn(String.format("* WARNING:"));