Basic infrastructure for filtering malformed reads.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1178 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-07-06 22:50:22 +00:00
parent b9d533042e
commit 5735c87581
11 changed files with 434 additions and 105 deletions

View File

@ -178,14 +178,6 @@ public abstract class LocusView extends LocusContextIterator implements View {
TraversalStatistics.nBadAlignments++;
result = true;
why = "No alignment start";
} else if (rec.getAlignmentEnd() != -1 && rec.getAlignmentEnd() < rec.getAlignmentStart() ) {
TraversalStatistics.nBadAlignments++;
result = true;
why = "Alignment ends before it starts";
} else if (rec.getAlignmentStart() != -1 && rec.getAlignmentBlocks().size() == 0) {
TraversalStatistics.nBadAlignments++;
result = true;
why = "Alignment cigar string is invalid";
} else if (rec.getDuplicateReadFlag()) {
TraversalStatistics.nDuplicates++;
result = true;

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.filter.FilteringIterator;
import net.sf.picard.filter.SamRecordFilter;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
@ -12,6 +14,8 @@ import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.SAMReadValidator;
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
import java.io.File;
import java.util.List;
@ -54,6 +58,11 @@ public class SAMDataSource implements SimpleDataSource {
/** Backing support for reads. */
private final Reads reads;
/**
* A histogram of exactly what reads were removed from the input stream and why.
*/
private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
@ -73,6 +82,14 @@ public class SAMDataSource implements SimpleDataSource {
// A pool of SAM iterators.
private SAMIteratorPool iteratorPool = null;
/**
* Returns a histogram of reads that were screened out, grouped by the nature of the error.
* @return Histogram of reads. Will not be null.
*/
public SAMReadViolationHistogram getViolationHistogram() {
return violations;
}
/**
* constructor, given sam files
*
@ -94,26 +111,12 @@ public class SAMDataSource implements SimpleDataSource {
}
/**
* For unit testing, add a custom iterator pool.
* Gets the (potentially merged) SAM file header.
*
* @param iteratorPool Custom mock iterator pool.
* @return SAM file header.
*/
void setResourcePool( SAMIteratorPool iteratorPool ) {
this.iteratorPool = iteratorPool;
}
/**
* <p>
* seekLocus
* </p>
*
* @param location the genome location to extract data for
*
* @return an iterator for that region
*/
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
return iteratorPool.iterator(new MappedStreamSegment(location));
public SAMFileHeader getHeader() {
return iteratorPool.getHeader();
}
/**
@ -130,14 +133,14 @@ public class SAMDataSource implements SimpleDataSource {
StingSAMIterator iterator = null;
if (shard.getShardType() == Shard.ShardType.READ) {
iterator = seekRead((ReadShard) shard);
iterator = TraversalEngine.applyDecoratingIterators(true,
iterator = applyDecoratingIterators(true,
iterator,
reads.getDownsamplingFraction(),
reads.getFilterZeroMappingQualityReads(),
reads.getSafetyChecking());
} else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) {
iterator = seekLocus(shard.getGenomeLoc());
iterator = TraversalEngine.applyDecoratingIterators(false,
iterator = applyDecoratingIterators(false,
iterator,
reads.getDownsamplingFraction(),
reads.getFilterZeroMappingQualityReads(),
@ -151,12 +154,16 @@ public class SAMDataSource implements SimpleDataSource {
/**
* Gets the (potentially merged) SAM file header.
* <p>
* seekLocus
* </p>
*
* @return SAM file header.
* @param location the genome location to extract data for
*
* @return an iterator for that region
*/
public SAMFileHeader getHeader() {
return iteratorPool.getHeader();
private StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
return iteratorPool.iterator(new MappedStreamSegment(location));
}
@ -209,6 +216,15 @@ public class SAMDataSource implements SimpleDataSource {
includeUnmappedReads = seeUnMappedReads;
}
/**
* For unit testing, add a custom iterator pool.
*
* @param iteratorPool Custom mock iterator pool.
*/
void setResourcePool( SAMIteratorPool iteratorPool ) {
this.iteratorPool = iteratorPool;
}
/**
* Retrieve unmapped reads.
*
@ -333,6 +349,49 @@ public class SAMDataSource implements SimpleDataSource {
return bound;
}
/**
* Filter reads based on user-specified criteria.
*
* @param enableVerification Verify the order of reads.
* @param wrappedIterator the raw data source.
* @param downsamplingFraction whether and how much to downsample the reads themselves (not at a locus).
* @param filterZeroMappingQualityReads whether to filter zero mapping quality reads.
* @param beSafeP Another trigger for the verifying iterator? TODO: look into this.
* @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
*/
private StingSAMIterator applyDecoratingIterators(boolean enableVerification,
StingSAMIterator wrappedIterator,
Double downsamplingFraction,
Boolean filterZeroMappingQualityReads,
Boolean beSafeP) {
wrappedIterator = new MalformedSAMFilteringIterator(wrappedIterator,violations);
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
if (downsamplingFraction != null)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
if (beSafeP != null && beSafeP && enableVerification)
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads )
wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc()));
return wrappedIterator;
}
private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
public boolean filterOut(SAMRecord rec) {
if (rec.getMappingQuality() == 0) {
//System.out.printf("Filtering 0 mapping quality read %s%n", rec.format());
return true;
} else {
return false;
}
}
}
}
class SAMIteratorPool extends ResourcePool<ReadStreamPointer, StingSAMIterator> {

View File

@ -135,8 +135,9 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
throw new StingException("Unable to retrieve result", ex);
}
traversalEngine.printOnTraversalDone(result);
walker.onTraversalDone(result);
walker.onTraversalDone(result);
printOnTraversalDone(result);
return result;
}

View File

@ -49,7 +49,7 @@ public class LinearMicroScheduler extends MicroScheduler {
Object result = accumulator.finishTraversal();
traversalEngine.printOnTraversalDone(result);
printOnTraversalDone(result);
return accumulator;
}

View File

@ -207,6 +207,15 @@ public abstract class MicroScheduler {
return new ShardDataProvider(shard, reads, reference, rods);
}
/**
* Print summary information for the analysis.
* @param sum The final reduce output.
*/
protected void printOnTraversalDone(Object sum) {
logger.info(String.format("%n%s",reads.getViolationHistogram()));
traversalEngine.printOnTraversalDone(sum);
}
/**
* Gets a data source for the given set of reads.
*

View File

@ -0,0 +1,134 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.sam.SAMReadValidator;
import org.broadinstitute.sting.utils.sam.SAMReadValidationException;
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
import java.util.NoSuchElementException;
/**
* A decorating iterator that examines the stream of reads, discarding those
* that fail to meet a minimum standard for consumption by the GATK.
*
* @author hanna
* @version 0.1
*/
public class MalformedSAMFilteringIterator implements StingSAMIterator {
/**
* The wrapped iterator. Get reads from here.
*/
private StingSAMIterator wrapped = null;
/**
* Collector for SAM read violations.
*/
private SAMReadViolationHistogram violations = null;
/**
* The next SAMRecord to return.;
*/
private SAMRecord next = null;
/**
* Creates a new MalformedSAMFilteringIterator, and provides a collector for the count
* @param wrapped The wrapped iterator to use as backing data.
* @param violations A structure to hold a breakdown of validator violations.
*/
public MalformedSAMFilteringIterator( StingSAMIterator wrapped, SAMReadViolationHistogram violations ) {
this.wrapped = wrapped;
this.violations = violations;
seedNext();
}
/**
* Returns source information about the reads.
* @return
*/
public Reads getSourceInfo() {
return wrapped.getSourceInfo();
}
/**
* Gets an iterator, helpful for foreach loops.
* @return An iterator sharing the same state variables as the current iterator.
*/
public StingSAMIterator iterator() {
return this;
}
/**
* Checks to see whether there's a
* @return True if a next is available, false otherwise.
*/
public boolean hasNext() {
return next != null;
}
/**
* Gets the next valid record from the stream.
* @return Next valid record.
*/
public SAMRecord next() {
SAMRecord current = next;
if( current == null )
throw new NoSuchElementException("MalformedSAMFilteringIterator: supply of reads is exhausted.");
seedNext();
return current;
}
/**
* Closes the wrapped iterator.
*/
public void close() {
wrapped.close();
}
/**
* Looks ahead for the next valid SAMRecord.
*/
protected void seedNext() {
next = null;
while( wrapped.hasNext() && next == null ) {
SAMRecord toTest = wrapped.next();
try {
SAMReadValidator.validate(toTest);
next = toTest;
}
catch ( SAMReadValidationException ex ) {
violations.addViolation(ex);
}
}
}
/**
* Throws an exception. Remove is not supported.
*/
public void remove() { throw new UnsupportedOperationException("Unable to remove from a StingSAMIterator"); }
}

View File

@ -275,13 +275,10 @@ public abstract class TraversalEngine {
/**
* A passthrough method so that subclasses can report which types of traversals they're using.
* TODO: Make this method abstract once all traversals support it.
* @param sum Result of the computation.
* @param <T> Type of the computation.
*/
public <T> void printOnTraversalDone( T sum ) {
throw new UnsupportedOperationException( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." );
}
public abstract <T> void printOnTraversalDone( T sum );
/**
* Called after a traversal to print out information about the traversal process
@ -296,7 +293,7 @@ public abstract class TraversalEngine {
final long curTime = System.currentTimeMillis();
final double elapsed = (curTime - startTime) / 1000.0;
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)",
logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
TraversalStatistics.nSkippedReads,
TraversalStatistics.nReads,
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
@ -327,67 +324,6 @@ public abstract class TraversalEngine {
return true;
}
@Deprecated
protected StingSAMIterator wrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
// Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to
// throw an exception if reads info isn't present.
StingSAMIterator wrappedIterator = StingSAMIteratorAdapter.adapt(null,rawIterator);
wrappedIterator = applyDecoratingIterators(enableVerification,wrappedIterator);
return wrappedIterator;
}
/**
* Repackage instance variables and call static method.
* TODO: This method's days are numbered.
* @param enableVerification
* @param wrappedIterator
* @return
*/
protected StingSAMIterator applyDecoratingIterators( final boolean enableVerification, final StingSAMIterator wrappedIterator ) {
return applyDecoratingIterators(enableVerification,
wrappedIterator,
DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null,
filterZeroMappingQualityReads,
beSafeP);
}
/**
* WARNING: In TraversalEngine for backward compatibility ONLY. Reads are not used as the data source, only as parameters
* for validation.
*/
public static StingSAMIterator applyDecoratingIterators(boolean enableVerification,
StingSAMIterator wrappedIterator,
Double downsamplingFraction,
Boolean filterZeroMappingQualityReads,
Boolean beSafeP) {
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
if (downsamplingFraction != null)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
if (beSafeP != null && beSafeP && enableVerification)
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads )
wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc()));
return wrappedIterator;
}
private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
public boolean filterOut(SAMRecord rec) {
if (rec.getMappingQuality() == 0) {
//System.out.printf("Filtering 0 mapping quality read %s%n", rec.format());
return true;
} else {
return false;
}
}
}
protected SAMFileReader initializeSAMFile(File samFile) {
// todo: fixme, this is a hack to try out dynamic merging
if ( samFile.toString().endsWith(".list") ) {

View File

@ -124,10 +124,6 @@ public class TraverseReads extends TraversalEngine {
// get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
// this is a temporary fix to deal with unmapped reads which "map" to a given location and have a MAPPED flag set
if ( site.getStop() != -1 && site.getStop() < site.getStart() )
continue;
// Jump forward in the reference to this locus location
locus = new LocusContext(site, Arrays.asList(read), Arrays.asList(0));

View File

@ -0,0 +1,52 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.utils.StingException;
/**
* Represents a validation failure, usually triggered by an inconsistency internal to the read.
* @author hanna
* @version 0.1
*/
public class SAMReadValidationException extends StingException {
/**
* Create a validation exception with only a message; no other traceback info is provided.
* @param message The message to pass along to the user.
*/
public SAMReadValidationException(String message) {
super(message);
}
/**
* Create a validation exception with a message and traceback info.
* @param message The message to pass along to the user.
* @param inner The exception to nest.
*/
public SAMReadValidationException(String message,Throwable inner) {
super(message,inner);
}
}

View File

@ -0,0 +1,81 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMRecord;
/**
* Validates reads against a specific set of criteria. If it finds a
* read that fails to meet the given criteria, it will throw an exception.
* The caller can decide whether to ignore the error, hide the read
* from the user, or blow up in a spectacular ball of fire.
*
* @author hanna
* @version 0.1
*/
public class SAMReadValidator {
/**
* Validate the sam read against a list of criteria that are known to cause failures in the GATK.
* Throw an exception if the read fails.
* @param read the read to validate. Must not be null.
*/
public static void validate( SAMRecord read ) throws SAMReadValidationException {
checkInvalidAlignmentStart(read);
checkInvalidAlignmentEnd(read);
checkCigarDisagreesWithAlignment(read);
}
/**
* Check for the case in which the alignment start is inconsistent with the read unmapped flag.
* @param read The read to validate.
*/
private static void checkInvalidAlignmentStart( SAMRecord read ) {
if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START )
throw new SAMReadValidationException("read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START");
if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 )
throw new SAMReadValidationException("Read is not flagged as 'unmapped', but alignment start is -1");
}
/**
* Check for invalid end of alignments.
* @param read The read to validate.
*/
private static void checkInvalidAlignmentEnd( SAMRecord read ) {
if( read.getAlignmentEnd() != -1 && read.getAlignmentEnd() < read.getAlignmentStart() )
throw new SAMReadValidationException("Alignment ends prior to its beginning");
}
/**
* Check for inconsistencies between the cigar string and the
* @param read The read to validate.
*/
private static void checkCigarDisagreesWithAlignment( SAMRecord read ) {
if( read.getAlignmentStart() != -1 &&
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START &&
read.getAlignmentBlocks().size() == 0 )
throw new SAMReadValidationException("Read has a valid alignment start, but the CIGAR string is empty");
}
}

View File

@ -0,0 +1,69 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import java.util.*;
/**
* Collects a series of violations to our SAM read validation criteria.
*
* @author hanna
* @version 0.1
*/
public class SAMReadViolationHistogram {
private Map<String,Long> violations = new HashMap<String,Long>();
/**
* Add a violation to the database of violations. For now, track
* only the number of occurrrences of a given violation.
* @param violation Violation to add, generated by the SAMReadValidator.
*/
public void addViolation( SAMReadValidationException violation ) {
String message = violation.getMessage();
if( !violations.containsKey( message ) )
violations.put( message, 0L );
violations.put(message,violations.get(message)+1);
}
public long getViolationCount() {
long totalViolations = 0L;
Collection<Long> violationCounts = violations.values();
for( Long violationCount: violationCounts )
totalViolations += violationCount;
return totalViolations;
}
public String toString() {
if( getViolationCount() == 0 )
return "";
StringBuilder violationOutput = new StringBuilder();
violationOutput.append("Eliminated malformed reads for the following reasons:\n");
for(Map.Entry<String,Long> violation: violations.entrySet())
violationOutput.append( String.format("\t%s: %d%n", violation.getKey(), violation.getValue()) );
return violationOutput.toString();
}
}