Restructuring of the mandatory read filters for traversals. Now everything uses ReadFilters, even for the required filters like being mapped for LocusWalkers. Statistics now tracked for each read filter used during the traversal and info emitted in INFO at the end.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3445 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3873dccb35
commit
dfc36c1e95
|
|
@ -1,20 +1,15 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* User: hanna
|
||||
* Date: May 13, 2009
|
||||
|
|
|
|||
|
|
@ -255,7 +255,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
|||
continue;
|
||||
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||
if(shard.getFilter() != null)
|
||||
iterator = new FilteringIterator(iterator,shard.getFilter());
|
||||
iterator = new FilteringIterator(iterator,shard.getFilter()); // not a counting iterator because we don't want to show the filtering of reads
|
||||
mergingIterator.addIterator(readers.getReader(id),iterator);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,13 +2,13 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
|||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -209,7 +209,7 @@ public abstract class SAMDataSource implements SimpleDataSource {
|
|||
|
||||
for( SamRecordFilter supplementalFilter: supplementalFilters )
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
|
||||
new FilteringIterator(wrappedIterator,supplementalFilter));
|
||||
new CountingFilteringIterator(wrappedIterator,supplementalFilter));
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,14 +11,14 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
|
|||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
|
||||
|
||||
/** A micro-scheduling manager for single-threaded execution of a traversal. */
|
||||
public class LinearMicroScheduler extends MicroScheduler {
|
||||
|
|
@ -57,7 +57,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
for (Shard shard : shardStrategy) {
|
||||
// New experimental code for managing locus intervals.
|
||||
if(shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) {
|
||||
WindowMaker windowMaker = new WindowMaker(getReadIterator(shard),shard.getGenomeLocs());
|
||||
WindowMaker windowMaker = new WindowMaker(getReadIterator(shard), shard.getGenomeLocs(), walker.getMandatoryReadFilters());
|
||||
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
|
||||
ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),iterator.getLocus(),iterator,reference,rods);
|
||||
Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit());
|
||||
|
|
|
|||
|
|
@ -4,7 +4,9 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -53,22 +55,27 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
|
|||
*/
|
||||
private boolean shardGenerated = false;
|
||||
|
||||
public WindowMaker(StingSAMIterator iterator, List<GenomeLoc> intervals) {
|
||||
this(iterator, intervals, new ArrayList<SamRecordFilter>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new window maker with the given iterator as a data source, covering
|
||||
* the given intervals.
|
||||
* @param iterator The data source for this window.
|
||||
* @param intervals The set of intervals over which to traverse.
|
||||
*/
|
||||
public WindowMaker(StingSAMIterator iterator, List<GenomeLoc> intervals) {
|
||||
public WindowMaker(StingSAMIterator iterator, List<GenomeLoc> intervals, List<SamRecordFilter> filters) {
|
||||
this.sourceInfo = iterator.getSourceInfo();
|
||||
this.readIterator = iterator;
|
||||
|
||||
LocusIterator locusIterator;
|
||||
Iterator<SAMRecord> wrappedIterator = TraversalEngine.addMandatoryFilteringIterators(iterator, filters);
|
||||
if(sourceInfo.getDownsamplingMethod() != null &&
|
||||
(sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_BY_SAMPLE || sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR))
|
||||
locusIterator = new DownsamplingLocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo);
|
||||
locusIterator = new DownsamplingLocusIteratorByState(wrappedIterator,sourceInfo);
|
||||
else
|
||||
locusIterator = new LocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo);
|
||||
locusIterator = new LocusIteratorByState(wrappedIterator,sourceInfo);
|
||||
|
||||
this.locusOverflowTracker = locusIterator.getLocusOverflowTracker();
|
||||
|
||||
|
|
@ -144,45 +151,4 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
|
|||
sourceIterator.next();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Class to filter out un-handle-able reads from the stream. We currently are skipping
|
||||
* unmapped reads, non-primary reads, unaligned reads, and duplicate reads.
|
||||
*/
|
||||
private static class LocusStreamFilterFunc implements SamRecordFilter {
|
||||
SAMRecord lastRead = null;
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
boolean result = false;
|
||||
String why = "";
|
||||
if (rec.getReadUnmappedFlag()) {
|
||||
TraversalStatistics.nUnmappedReads++;
|
||||
result = true;
|
||||
why = "Unmapped";
|
||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||
TraversalStatistics.nNotPrimary++;
|
||||
result = true;
|
||||
why = "Not Primary";
|
||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "No alignment start";
|
||||
} else if (rec.getDuplicateReadFlag()) {
|
||||
TraversalStatistics.nDuplicates++;
|
||||
result = true;
|
||||
why = "Duplicate reads";
|
||||
}
|
||||
else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
if (result) {
|
||||
TraversalStatistics.nSkippedReads++;
|
||||
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||
} else {
|
||||
TraversalStatistics.nReads++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,127 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.samtools.util.CloserUtil;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
|
||||
/**
|
||||
* Filtering Iterator which takes a filter and an iterator and iterates
|
||||
* through only those records which are not rejected by the filter.
|
||||
*
|
||||
* Just a copy of a unmodifiable FilteringIterator from Picard
|
||||
*
|
||||
* @author Mark DePristo
|
||||
*/
|
||||
public class CountingFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
private final Iterator<SAMRecord> iterator;
|
||||
private final SamRecordFilter filter;
|
||||
private SAMRecord next = null;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param iterator the backing iterator
|
||||
* @param filter the filter (which may be a FilterAggregator)
|
||||
*/
|
||||
public CountingFilteringIterator(Iterator<SAMRecord> iterator, SamRecordFilter filter) {
|
||||
this.iterator = iterator;
|
||||
this.filter = filter;
|
||||
next = getNextRecord();
|
||||
}
|
||||
|
||||
/**
|
||||
* Special case to count passing records
|
||||
* @param iterator
|
||||
*/
|
||||
public CountingFilteringIterator(Iterator<SAMRecord> iterator) {
|
||||
this(iterator, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the iteration has more elements.
|
||||
*
|
||||
* @return true if the iteration has more elements. Otherwise returns false.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return next != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next element in the iteration.
|
||||
*
|
||||
* @return the next element in the iteration
|
||||
* @throws java.util.NoSuchElementException
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
if (next == null) {
|
||||
throw new NoSuchElementException("Iterator has no more elements.");
|
||||
}
|
||||
final SAMRecord result = next;
|
||||
next = getNextRecord();
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Required method for Iterator API.
|
||||
*
|
||||
* @throws UnsupportedOperationException
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Remove() not supported by CountingFilteringIterator");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
CloserUtil.close(iterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next record from the underlying iterator that passes the filter
|
||||
*
|
||||
* @return SAMRecord the next filter-passing record
|
||||
*/
|
||||
private SAMRecord getNextRecord() {
|
||||
while (iterator.hasNext()) {
|
||||
SAMRecord record = iterator.next();
|
||||
|
||||
if ( filter == null ) {
|
||||
TraversalStatistics.nReads++;
|
||||
return record;
|
||||
} else if (!filter.filterOut(record)) {
|
||||
return record;
|
||||
} else {
|
||||
TraversalStatistics.incrementFilter(filter);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 9, 2009
|
||||
*
|
||||
* Filter out duplicate reads.
|
||||
*/
|
||||
|
||||
public class NotPrimaryAlignmentReadFilter implements SamRecordFilter {
|
||||
public boolean filterOut( final SAMRecord read ) {
|
||||
return read.getNotPrimaryAlignmentFlag();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Filters out read pairs where the reads are so long relative to the over fragment size that they are
|
||||
* reading into each other's adaptors.
|
||||
*
|
||||
* Normally, fragments are sufficiently far apart that reads aren't reading into each other.
|
||||
*
|
||||
* |--------------------> first read
|
||||
* <--------------------| second read
|
||||
*
|
||||
* Sometimes, mostly due to lab errors or constraints, fragment library are made too short relative to the
|
||||
* length of the reads. For example, it's possible to have 76bp PE reads with 125 bp inserts, so that ~25 bp of each
|
||||
* read overlaps with its mate.
|
||||
*
|
||||
* |--------------------> first read
|
||||
* <--------------------| second read
|
||||
*
|
||||
* This filter deals with the situation where the fragment is so small that the each read actually reads into the
|
||||
* adaptor sequence of its mate, generating mismatches at both ends of the read:
|
||||
*
|
||||
* |----------------XXXX> first read
|
||||
* <XXXX----------------| second read
|
||||
*
|
||||
* So, how do we detect the problematic case? Since the first and second pair can land on chromosome in either order, we need
|
||||
* to consider not first and second, but rather left and right, defined by which is :
|
||||
*
|
||||
* |----------------XXXX> left read
|
||||
* <XXXX----------------| right read
|
||||
*
|
||||
*
|
||||
* The problematic situation occurs when the alignment stop of this read is greater the alignment start of its mate
|
||||
*
|
||||
* @author depristo
|
||||
* @version 0.1
|
||||
*/
|
||||
public class TinyFragmentFilter implements SamRecordFilter {
|
||||
public boolean filterOut(final SAMRecord rec) {
|
||||
long isize = rec.getInferredInsertSize();
|
||||
if ( isize == 0 )
|
||||
return false; // unmapped pair -- cannot filter out
|
||||
else {
|
||||
long start = rec.getAlignmentStart();
|
||||
long end = rec.getAlignmentEnd();
|
||||
long mateStart = rec.getMateAlignmentStart();
|
||||
long mateEnd = rec.getAlignmentStart() + isize;
|
||||
boolean bad = rec.getReadNegativeStrandFlag() ? start < mateStart : end > mateEnd;
|
||||
//System.out.printf("%s %d %d %d %d %d => %b%n", rec.getReadName(), start, end, mateStart, mateEnd, isize, bad);
|
||||
if ( bad ) {
|
||||
//System.out.printf("TinyFragment: " + rec.format());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 9, 2009
|
||||
*
|
||||
* Filter out duplicate reads.
|
||||
*/
|
||||
|
||||
public class UnmappedReadFilter implements SamRecordFilter {
|
||||
public boolean filterOut( final SAMRecord read ) {
|
||||
return read.getReadUnmappedFlag() || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START;
|
||||
}
|
||||
}
|
||||
|
|
@ -3,7 +3,17 @@ package org.broadinstitute.sting.gatk.traversals;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,ProviderType extends ShardDataProvider> {
|
||||
// Time in milliseconds since we initialized this engine
|
||||
|
|
@ -88,16 +98,22 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
printProgress(true, type, null);
|
||||
final long curTime = System.currentTimeMillis();
|
||||
final double elapsed = (curTime - startTime) / 1000.0;
|
||||
|
||||
// count up the number of skipped reads by summing over all filters
|
||||
long nSkippedReads = 0L;
|
||||
for ( long counts : TraversalStatistics.counter.values() )
|
||||
nSkippedReads += counts;
|
||||
|
||||
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
||||
logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
|
||||
TraversalStatistics.nSkippedReads,
|
||||
logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)",
|
||||
nSkippedReads,
|
||||
TraversalStatistics.nReads,
|
||||
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
||||
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
|
||||
logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates));
|
||||
logger.info(String.format(" -> %d reads with non-primary alignments", TraversalStatistics.nNotPrimary));
|
||||
logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments));
|
||||
logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels));
|
||||
100.0 * MathUtils.ratio(nSkippedReads, TraversalStatistics.nReads)));
|
||||
for ( Map.Entry<Class, Long> filterCounts : TraversalStatistics.counter.entrySet() ) {
|
||||
long count = filterCounts.getValue();
|
||||
logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s",
|
||||
count, 100.0 * MathUtils.ratio(count, TraversalStatistics.nReads), Utils.getClassName(filterCounts.getKey())));
|
||||
}
|
||||
}
|
||||
|
||||
/** Initialize the traversal engine. After this point traversals can be run over the data */
|
||||
|
|
@ -117,4 +133,15 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
public abstract T traverse(WalkerType walker,
|
||||
ProviderType dataProvider,
|
||||
T sum);
|
||||
|
||||
public static Iterator<SAMRecord> addMandatoryFilteringIterators(Iterator<SAMRecord> iter, List<SamRecordFilter> filters ) {
|
||||
for( SamRecordFilter filter : filters ) {
|
||||
//logger.debug("Adding filter " + filter.getClass());
|
||||
iter = new CountingFilteringIterator(iter,filter);
|
||||
}
|
||||
|
||||
return new CountingFilteringIterator(iter); // special case to count all reads
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,12 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
|
|
@ -20,11 +27,21 @@ public class TraversalStatistics {
|
|||
public static long nBadAlignments;
|
||||
public static long nSkippedIndels;
|
||||
public static long nDuplicates;
|
||||
public static Map<Class, Long> counter = new HashMap<Class, Long>();
|
||||
|
||||
static {
|
||||
reset();
|
||||
}
|
||||
|
||||
public static void incrementFilter(SamRecordFilter filter) {
|
||||
long c = 0;
|
||||
if ( counter.containsKey(filter.getClass()) ) {
|
||||
c = counter.get(filter.getClass());
|
||||
}
|
||||
|
||||
counter.put(filter.getClass(), c + 1L);
|
||||
}
|
||||
|
||||
public static void reset() {
|
||||
nRecords = 0;
|
||||
nReads = 0;
|
||||
|
|
|
|||
|
|
@ -25,9 +25,9 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
|
|
@ -148,37 +148,6 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class to filter out un-handle-able reads from the stream. We currently are skipping
|
||||
* unmapped reads, non-primary reads, unaligned reads, and duplicate reads.
|
||||
*/
|
||||
public static class duplicateStreamFilterFunc implements SamRecordFilter {
|
||||
SAMRecord lastRead = null;
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
boolean result = false;
|
||||
if (rec.getReadUnmappedFlag()) {
|
||||
TraversalStatistics.nUnmappedReads++;
|
||||
result = true;
|
||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||
TraversalStatistics.nNotPrimary++;
|
||||
result = true;
|
||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
} else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
if (result) {
|
||||
TraversalStatistics.nSkippedReads++;
|
||||
//System.out.printf(" [filter] %s => %b", rec.getReadName(), result);
|
||||
} else {
|
||||
TraversalStatistics.nReads++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// new style interface to the system
|
||||
|
|
@ -196,7 +165,7 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
|
|||
public T traverse(DuplicateWalker<M, T> walker,
|
||||
ReadShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc());
|
||||
Iterator<SAMRecord> filterIter = addMandatoryFilteringIterators(new ReadView(dataProvider).iterator(), walker.getMandatoryReadFilters());
|
||||
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,12 +1,19 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -27,4 +34,20 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
|
|||
// Given result of map function
|
||||
public abstract ReduceType reduceInit();
|
||||
public abstract ReduceType reduce(MapType value, ReduceType sum);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// read filters
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public List<SamRecordFilter> getMandatoryReadFilters() {
|
||||
SamRecordFilter filter1 = new UnmappedReadFilter();
|
||||
SamRecordFilter filter2 = new NotPrimaryAlignmentReadFilter();
|
||||
List<SamRecordFilter> x = super.getMandatoryReadFilters();
|
||||
|
||||
x.addAll(Arrays.asList(filter2, filter1));
|
||||
return x;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -3,6 +3,15 @@ package org.broadinstitute.sting.gatk.walkers;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -21,4 +30,64 @@ public abstract class LocusWalker<MapType, ReduceType> extends Walker<MapType, R
|
|||
|
||||
// Map over the org.broadinstitute.sting.gatk.contexts.AlignmentContext
|
||||
public abstract MapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// mandatory read filters
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public List<SamRecordFilter> getMandatoryReadFilters() {
|
||||
// if ( false ) {
|
||||
// SamRecordFilter filter = new LocusStreamFilterFunc();
|
||||
// return Arrays.asList(filter);
|
||||
// } else {
|
||||
SamRecordFilter filter1 = new UnmappedReadFilter();
|
||||
SamRecordFilter filter2 = new NotPrimaryAlignmentReadFilter();
|
||||
SamRecordFilter filter3 = new DuplicateReadFilter();
|
||||
List<SamRecordFilter> x = super.getMandatoryReadFilters();
|
||||
x.addAll(Arrays.asList(filter3, filter2, filter1));
|
||||
// }
|
||||
return x;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class to filter out un-handle-able reads from the stream. We currently are skipping
|
||||
* unmapped reads, non-primary reads, unaligned reads, and duplicate reads.
|
||||
*/
|
||||
private static class LocusStreamFilterFunc implements SamRecordFilter {
|
||||
SAMRecord lastRead = null;
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
boolean result = false;
|
||||
if (rec.getReadUnmappedFlag()) {
|
||||
TraversalStatistics.nUnmappedReads++;
|
||||
result = true;
|
||||
//why = "Unmapped";
|
||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||
TraversalStatistics.nNotPrimary++;
|
||||
result = true;
|
||||
// why = "Not Primary";
|
||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
// why = "No alignment start";
|
||||
} else if (rec.getDuplicateReadFlag()) {
|
||||
TraversalStatistics.nDuplicates++;
|
||||
result = true;
|
||||
// why = "Duplicate reads";
|
||||
}
|
||||
else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
if (result) {
|
||||
TraversalStatistics.nSkippedReads++;
|
||||
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||
} else {
|
||||
TraversalStatistics.nReads++;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,11 +27,13 @@ package org.broadinstitute.sting.gatk.walkers;
|
|||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.apache.log4j.Logger;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -133,6 +135,15 @@ public abstract class Walker<MapType, ReduceType> {
|
|||
out.println("[REDUCE RESULT] Traversal result is: " + result);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a list of SamRecordFilters that *must* be applied to the read stream for the traversal to work
|
||||
* @return a list of SamRecordFilters to apply in order
|
||||
*/
|
||||
public List<SamRecordFilter> getMandatoryReadFilters() {
|
||||
return new ArrayList<SamRecordFilter>(); // by default
|
||||
}
|
||||
|
||||
/**
|
||||
* General interval reduce routine called after all of the traversals are done
|
||||
* @param results
|
||||
|
|
|
|||
|
|
@ -49,6 +49,31 @@ public class Utils {
|
|||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(Utils.class);
|
||||
|
||||
public static String getClassName(Class c) {
|
||||
String FQClassName = c.getName();
|
||||
int firstChar;
|
||||
firstChar = FQClassName.lastIndexOf ('.') + 1;
|
||||
if ( firstChar > 0 ) {
|
||||
FQClassName = FQClassName.substring ( firstChar );
|
||||
}
|
||||
return FQClassName;
|
||||
}
|
||||
|
||||
|
||||
// returns package and class name
|
||||
public static String getFullClassName(Class c) {
|
||||
return c.getName();
|
||||
}
|
||||
|
||||
// returns the package without the classname, empty string if
|
||||
// there is no package
|
||||
public static String getPackageName(Class c) {
|
||||
String fullyQualifiedName = c.getName();
|
||||
int lastDot = fullyQualifiedName.lastIndexOf ('.');
|
||||
if (lastDot==-1){ return ""; }
|
||||
return fullyQualifiedName.substring (0, lastDot);
|
||||
}
|
||||
|
||||
public static void warnUser(final String msg) {
|
||||
logger.warn(String.format("********************************************************************************"));
|
||||
logger.warn(String.format("* WARNING:"));
|
||||
|
|
|
|||
Loading…
Reference in New Issue