Basic infrastructure for filtering malformed reads.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1178 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b9d533042e
commit
5735c87581
|
|
@ -178,14 +178,6 @@ public abstract class LocusView extends LocusContextIterator implements View {
|
|||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "No alignment start";
|
||||
} else if (rec.getAlignmentEnd() != -1 && rec.getAlignmentEnd() < rec.getAlignmentStart() ) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "Alignment ends before it starts";
|
||||
} else if (rec.getAlignmentStart() != -1 && rec.getAlignmentBlocks().size() == 0) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "Alignment cigar string is invalid";
|
||||
} else if (rec.getDuplicateReadFlag()) {
|
||||
TraversalStatistics.nDuplicates++;
|
||||
result = true;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
|||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
|
|
@ -12,6 +14,8 @@ import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.sam.SAMReadValidator;
|
||||
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
|
|
@ -54,6 +58,11 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
/** Backing support for reads. */
|
||||
private final Reads reads;
|
||||
|
||||
/**
|
||||
* A histogram of exactly what reads were removed from the input stream and why.
|
||||
*/
|
||||
private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||
|
||||
|
|
@ -73,6 +82,14 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
// A pool of SAM iterators.
|
||||
private SAMIteratorPool iteratorPool = null;
|
||||
|
||||
/**
|
||||
* Returns a histogram of reads that were screened out, grouped by the nature of the error.
|
||||
* @return Histogram of reads. Will not be null.
|
||||
*/
|
||||
public SAMReadViolationHistogram getViolationHistogram() {
|
||||
return violations;
|
||||
}
|
||||
|
||||
/**
|
||||
* constructor, given sam files
|
||||
*
|
||||
|
|
@ -94,26 +111,12 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
}
|
||||
|
||||
/**
|
||||
* For unit testing, add a custom iterator pool.
|
||||
* Gets the (potentially merged) SAM file header.
|
||||
*
|
||||
* @param iteratorPool Custom mock iterator pool.
|
||||
* @return SAM file header.
|
||||
*/
|
||||
void setResourcePool( SAMIteratorPool iteratorPool ) {
|
||||
this.iteratorPool = iteratorPool;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* seekLocus
|
||||
* </p>
|
||||
*
|
||||
* @param location the genome location to extract data for
|
||||
*
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
|
||||
return iteratorPool.iterator(new MappedStreamSegment(location));
|
||||
public SAMFileHeader getHeader() {
|
||||
return iteratorPool.getHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -130,14 +133,14 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
StingSAMIterator iterator = null;
|
||||
if (shard.getShardType() == Shard.ShardType.READ) {
|
||||
iterator = seekRead((ReadShard) shard);
|
||||
iterator = TraversalEngine.applyDecoratingIterators(true,
|
||||
iterator = applyDecoratingIterators(true,
|
||||
iterator,
|
||||
reads.getDownsamplingFraction(),
|
||||
reads.getFilterZeroMappingQualityReads(),
|
||||
reads.getSafetyChecking());
|
||||
} else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) {
|
||||
iterator = seekLocus(shard.getGenomeLoc());
|
||||
iterator = TraversalEngine.applyDecoratingIterators(false,
|
||||
iterator = applyDecoratingIterators(false,
|
||||
iterator,
|
||||
reads.getDownsamplingFraction(),
|
||||
reads.getFilterZeroMappingQualityReads(),
|
||||
|
|
@ -151,12 +154,16 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
|
||||
|
||||
/**
|
||||
* Gets the (potentially merged) SAM file header.
|
||||
* <p>
|
||||
* seekLocus
|
||||
* </p>
|
||||
*
|
||||
* @return SAM file header.
|
||||
* @param location the genome location to extract data for
|
||||
*
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
public SAMFileHeader getHeader() {
|
||||
return iteratorPool.getHeader();
|
||||
private StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
|
||||
return iteratorPool.iterator(new MappedStreamSegment(location));
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -209,6 +216,15 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
includeUnmappedReads = seeUnMappedReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* For unit testing, add a custom iterator pool.
|
||||
*
|
||||
* @param iteratorPool Custom mock iterator pool.
|
||||
*/
|
||||
void setResourcePool( SAMIteratorPool iteratorPool ) {
|
||||
this.iteratorPool = iteratorPool;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve unmapped reads.
|
||||
*
|
||||
|
|
@ -333,6 +349,49 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
return bound;
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter reads based on user-specified criteria.
|
||||
*
|
||||
* @param enableVerification Verify the order of reads.
|
||||
* @param wrappedIterator the raw data source.
|
||||
* @param downsamplingFraction whether and how much to downsample the reads themselves (not at a locus).
|
||||
* @param filterZeroMappingQualityReads whether to filter zero mapping quality reads.
|
||||
* @param beSafeP Another trigger for the verifying iterator? TODO: look into this.
|
||||
* @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
|
||||
*/
|
||||
private StingSAMIterator applyDecoratingIterators(boolean enableVerification,
|
||||
StingSAMIterator wrappedIterator,
|
||||
Double downsamplingFraction,
|
||||
Boolean filterZeroMappingQualityReads,
|
||||
Boolean beSafeP) {
|
||||
wrappedIterator = new MalformedSAMFilteringIterator(wrappedIterator,violations);
|
||||
|
||||
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
||||
// as there is no reason to sort something that we will end of throwing away
|
||||
if (downsamplingFraction != null)
|
||||
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
if (beSafeP != null && beSafeP && enableVerification)
|
||||
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
|
||||
|
||||
if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads )
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
|
||||
new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc()));
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
if (rec.getMappingQuality() == 0) {
|
||||
//System.out.printf("Filtering 0 mapping quality read %s%n", rec.format());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class SAMIteratorPool extends ResourcePool<ReadStreamPointer, StingSAMIterator> {
|
||||
|
|
|
|||
|
|
@ -135,8 +135,9 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
throw new StingException("Unable to retrieve result", ex);
|
||||
}
|
||||
|
||||
traversalEngine.printOnTraversalDone(result);
|
||||
walker.onTraversalDone(result);
|
||||
walker.onTraversalDone(result);
|
||||
|
||||
printOnTraversalDone(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
|
||||
Object result = accumulator.finishTraversal();
|
||||
|
||||
traversalEngine.printOnTraversalDone(result);
|
||||
printOnTraversalDone(result);
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -207,6 +207,15 @@ public abstract class MicroScheduler {
|
|||
return new ShardDataProvider(shard, reads, reference, rods);
|
||||
}
|
||||
|
||||
/**
|
||||
* Print summary information for the analysis.
|
||||
* @param sum The final reduce output.
|
||||
*/
|
||||
protected void printOnTraversalDone(Object sum) {
|
||||
logger.info(String.format("%n%s",reads.getViolationHistogram()));
|
||||
traversalEngine.printOnTraversalDone(sum);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a data source for the given set of reads.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,134 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.sam.SAMReadValidator;
|
||||
import org.broadinstitute.sting.utils.sam.SAMReadValidationException;
|
||||
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
|
||||
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* A decorating iterator that examines the stream of reads, discarding those
|
||||
* that fail to meet a minimum standard for consumption by the GATK.
|
||||
*
|
||||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
||||
public class MalformedSAMFilteringIterator implements StingSAMIterator {
|
||||
/**
|
||||
* The wrapped iterator. Get reads from here.
|
||||
*/
|
||||
private StingSAMIterator wrapped = null;
|
||||
|
||||
/**
|
||||
* Collector for SAM read violations.
|
||||
*/
|
||||
private SAMReadViolationHistogram violations = null;
|
||||
|
||||
/**
|
||||
* The next SAMRecord to return.;
|
||||
*/
|
||||
private SAMRecord next = null;
|
||||
|
||||
/**
|
||||
* Creates a new MalformedSAMFilteringIterator, and provides a collector for the count
|
||||
* @param wrapped The wrapped iterator to use as backing data.
|
||||
* @param violations A structure to hold a breakdown of validator violations.
|
||||
*/
|
||||
public MalformedSAMFilteringIterator( StingSAMIterator wrapped, SAMReadViolationHistogram violations ) {
|
||||
this.wrapped = wrapped;
|
||||
this.violations = violations;
|
||||
seedNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns source information about the reads.
|
||||
* @return
|
||||
*/
|
||||
public Reads getSourceInfo() {
|
||||
return wrapped.getSourceInfo();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an iterator, helpful for foreach loops.
|
||||
* @return An iterator sharing the same state variables as the current iterator.
|
||||
*/
|
||||
public StingSAMIterator iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks to see whether there's a
|
||||
* @return True if a next is available, false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return next != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next valid record from the stream.
|
||||
* @return Next valid record.
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
SAMRecord current = next;
|
||||
if( current == null )
|
||||
throw new NoSuchElementException("MalformedSAMFilteringIterator: supply of reads is exhausted.");
|
||||
seedNext();
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the wrapped iterator.
|
||||
*/
|
||||
public void close() {
|
||||
wrapped.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks ahead for the next valid SAMRecord.
|
||||
*/
|
||||
protected void seedNext() {
|
||||
next = null;
|
||||
while( wrapped.hasNext() && next == null ) {
|
||||
SAMRecord toTest = wrapped.next();
|
||||
try {
|
||||
SAMReadValidator.validate(toTest);
|
||||
next = toTest;
|
||||
}
|
||||
catch ( SAMReadValidationException ex ) {
|
||||
violations.addViolation(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Throws an exception. Remove is not supported.
|
||||
*/
|
||||
public void remove() { throw new UnsupportedOperationException("Unable to remove from a StingSAMIterator"); }
|
||||
}
|
||||
|
|
@ -275,13 +275,10 @@ public abstract class TraversalEngine {
|
|||
|
||||
/**
|
||||
* A passthrough method so that subclasses can report which types of traversals they're using.
|
||||
* TODO: Make this method abstract once all traversals support it.
|
||||
* @param sum Result of the computation.
|
||||
* @param <T> Type of the computation.
|
||||
*/
|
||||
public <T> void printOnTraversalDone( T sum ) {
|
||||
throw new UnsupportedOperationException( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." );
|
||||
}
|
||||
public abstract <T> void printOnTraversalDone( T sum );
|
||||
|
||||
/**
|
||||
* Called after a traversal to print out information about the traversal process
|
||||
|
|
@ -296,7 +293,7 @@ public abstract class TraversalEngine {
|
|||
final long curTime = System.currentTimeMillis();
|
||||
final double elapsed = (curTime - startTime) / 1000.0;
|
||||
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
||||
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)",
|
||||
logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
|
||||
TraversalStatistics.nSkippedReads,
|
||||
TraversalStatistics.nReads,
|
||||
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
||||
|
|
@ -327,67 +324,6 @@ public abstract class TraversalEngine {
|
|||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Deprecated
|
||||
protected StingSAMIterator wrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
|
||||
// Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to
|
||||
// throw an exception if reads info isn't present.
|
||||
StingSAMIterator wrappedIterator = StingSAMIteratorAdapter.adapt(null,rawIterator);
|
||||
wrappedIterator = applyDecoratingIterators(enableVerification,wrappedIterator);
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Repackage instance variables and call static method.
|
||||
* TODO: This method's days are numbered.
|
||||
* @param enableVerification
|
||||
* @param wrappedIterator
|
||||
* @return
|
||||
*/
|
||||
protected StingSAMIterator applyDecoratingIterators( final boolean enableVerification, final StingSAMIterator wrappedIterator ) {
|
||||
return applyDecoratingIterators(enableVerification,
|
||||
wrappedIterator,
|
||||
DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null,
|
||||
filterZeroMappingQualityReads,
|
||||
beSafeP);
|
||||
}
|
||||
|
||||
/**
|
||||
* WARNING: In TraversalEngine for backward compatibility ONLY. Reads are not used as the data source, only as parameters
|
||||
* for validation.
|
||||
*/
|
||||
public static StingSAMIterator applyDecoratingIterators(boolean enableVerification,
|
||||
StingSAMIterator wrappedIterator,
|
||||
Double downsamplingFraction,
|
||||
Boolean filterZeroMappingQualityReads,
|
||||
Boolean beSafeP) {
|
||||
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
||||
// as there is no reason to sort something that we will end of throwing away
|
||||
if (downsamplingFraction != null)
|
||||
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
if (beSafeP != null && beSafeP && enableVerification)
|
||||
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
|
||||
|
||||
if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads )
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
|
||||
new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc()));
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
if (rec.getMappingQuality() == 0) {
|
||||
//System.out.printf("Filtering 0 mapping quality read %s%n", rec.format());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected SAMFileReader initializeSAMFile(File samFile) {
|
||||
// todo: fixme, this is a hack to try out dynamic merging
|
||||
if ( samFile.toString().endsWith(".list") ) {
|
||||
|
|
|
|||
|
|
@ -124,10 +124,6 @@ public class TraverseReads extends TraversalEngine {
|
|||
// get the genome loc from the read
|
||||
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
||||
|
||||
// this is a temporary fix to deal with unmapped reads which "map" to a given location and have a MAPPED flag set
|
||||
if ( site.getStop() != -1 && site.getStop() < site.getStart() )
|
||||
continue;
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
locus = new LocusContext(site, Arrays.asList(read), Arrays.asList(0));
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
* Represents a validation failure, usually triggered by an inconsistency internal to the read.
|
||||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
||||
public class SAMReadValidationException extends StingException {
|
||||
/**
|
||||
* Create a validation exception with only a message; no other traceback info is provided.
|
||||
* @param message The message to pass along to the user.
|
||||
*/
|
||||
public SAMReadValidationException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a validation exception with a message and traceback info.
|
||||
* @param message The message to pass along to the user.
|
||||
* @param inner The exception to nest.
|
||||
*/
|
||||
public SAMReadValidationException(String message,Throwable inner) {
|
||||
super(message,inner);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Validates reads against a specific set of criteria. If it finds a
|
||||
* read that fails to meet the given criteria, it will throw an exception.
|
||||
* The caller can decide whether to ignore the error, hide the read
|
||||
* from the user, or blow up in a spectacular ball of fire.
|
||||
*
|
||||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SAMReadValidator {
|
||||
/**
|
||||
* Validate the sam read against a list of criteria that are known to cause failures in the GATK.
|
||||
* Throw an exception if the read fails.
|
||||
* @param read the read to validate. Must not be null.
|
||||
*/
|
||||
public static void validate( SAMRecord read ) throws SAMReadValidationException {
|
||||
checkInvalidAlignmentStart(read);
|
||||
checkInvalidAlignmentEnd(read);
|
||||
checkCigarDisagreesWithAlignment(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for the case in which the alignment start is inconsistent with the read unmapped flag.
|
||||
* @param read The read to validate.
|
||||
*/
|
||||
private static void checkInvalidAlignmentStart( SAMRecord read ) {
|
||||
if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START )
|
||||
throw new SAMReadValidationException("read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START");
|
||||
if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 )
|
||||
throw new SAMReadValidationException("Read is not flagged as 'unmapped', but alignment start is -1");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for invalid end of alignments.
|
||||
* @param read The read to validate.
|
||||
*/
|
||||
private static void checkInvalidAlignmentEnd( SAMRecord read ) {
|
||||
if( read.getAlignmentEnd() != -1 && read.getAlignmentEnd() < read.getAlignmentStart() )
|
||||
throw new SAMReadValidationException("Alignment ends prior to its beginning");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for inconsistencies between the cigar string and the
|
||||
* @param read The read to validate.
|
||||
*/
|
||||
private static void checkCigarDisagreesWithAlignment( SAMRecord read ) {
|
||||
if( read.getAlignmentStart() != -1 &&
|
||||
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START &&
|
||||
read.getAlignmentBlocks().size() == 0 )
|
||||
throw new SAMReadValidationException("Read has a valid alignment start, but the CIGAR string is empty");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Collects a series of violations to our SAM read validation criteria.
|
||||
*
|
||||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SAMReadViolationHistogram {
|
||||
private Map<String,Long> violations = new HashMap<String,Long>();
|
||||
|
||||
/**
|
||||
* Add a violation to the database of violations. For now, track
|
||||
* only the number of occurrrences of a given violation.
|
||||
* @param violation Violation to add, generated by the SAMReadValidator.
|
||||
*/
|
||||
public void addViolation( SAMReadValidationException violation ) {
|
||||
String message = violation.getMessage();
|
||||
if( !violations.containsKey( message ) )
|
||||
violations.put( message, 0L );
|
||||
violations.put(message,violations.get(message)+1);
|
||||
}
|
||||
|
||||
public long getViolationCount() {
|
||||
long totalViolations = 0L;
|
||||
Collection<Long> violationCounts = violations.values();
|
||||
for( Long violationCount: violationCounts )
|
||||
totalViolations += violationCount;
|
||||
return totalViolations;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
if( getViolationCount() == 0 )
|
||||
return "";
|
||||
|
||||
StringBuilder violationOutput = new StringBuilder();
|
||||
violationOutput.append("Eliminated malformed reads for the following reasons:\n");
|
||||
for(Map.Entry<String,Long> violation: violations.entrySet())
|
||||
violationOutput.append( String.format("\t%s: %d%n", violation.getKey(), violation.getValue()) );
|
||||
|
||||
return violationOutput.toString();
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue