diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index 40fa2a003..49b805fcd 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -178,14 +178,6 @@ public abstract class LocusView extends LocusContextIterator implements View { TraversalStatistics.nBadAlignments++; result = true; why = "No alignment start"; - } else if (rec.getAlignmentEnd() != -1 && rec.getAlignmentEnd() < rec.getAlignmentStart() ) { - TraversalStatistics.nBadAlignments++; - result = true; - why = "Alignment ends before it starts"; - } else if (rec.getAlignmentStart() != -1 && rec.getAlignmentBlocks().size() == 0) { - TraversalStatistics.nBadAlignments++; - result = true; - why = "Alignment cigar string is invalid"; } else if (rec.getDuplicateReadFlag()) { TraversalStatistics.nDuplicates++; result = true; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index cf2edaf43..1879771f5 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.filter.FilteringIterator; +import net.sf.picard.filter.SamRecordFilter; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; import org.broadinstitute.sting.gatk.datasources.shards.Shard; @@ -12,6 +14,8 @@ import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import 
org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.SAMReadValidator; +import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; import java.util.List; @@ -54,6 +58,11 @@ public class SAMDataSource implements SimpleDataSource { /** Backing support for reads. */ private final Reads reads; + /** + * A histogram of exactly what reads were removed from the input stream and why. + */ + private SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); + /** our log, which we want to capture anything from this class */ protected static Logger logger = Logger.getLogger(SAMDataSource.class); @@ -73,6 +82,14 @@ public class SAMDataSource implements SimpleDataSource { // A pool of SAM iterators. private SAMIteratorPool iteratorPool = null; + /** + * Returns a histogram of reads that were screened out, grouped by the nature of the error. + * @return Histogram of reads. Will not be null. + */ + public SAMReadViolationHistogram getViolationHistogram() { + return violations; + } + /** * constructor, given sam files * @@ -94,26 +111,12 @@ public class SAMDataSource implements SimpleDataSource { } /** - * For unit testing, add a custom iterator pool. + * Gets the (potentially merged) SAM file header. * - * @param iteratorPool Custom mock iterator pool. + * @return SAM file header. */ - void setResourcePool( SAMIteratorPool iteratorPool ) { - this.iteratorPool = iteratorPool; - } - - - /** - *

- * seekLocus - *

- * - * @param location the genome location to extract data for - * - * @return an iterator for that region - */ - public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException { - return iteratorPool.iterator(new MappedStreamSegment(location)); + public SAMFileHeader getHeader() { + return iteratorPool.getHeader(); } /** @@ -130,14 +133,14 @@ public class SAMDataSource implements SimpleDataSource { StingSAMIterator iterator = null; if (shard.getShardType() == Shard.ShardType.READ) { iterator = seekRead((ReadShard) shard); - iterator = TraversalEngine.applyDecoratingIterators(true, + iterator = applyDecoratingIterators(true, iterator, reads.getDownsamplingFraction(), reads.getFilterZeroMappingQualityReads(), reads.getSafetyChecking()); } else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) { iterator = seekLocus(shard.getGenomeLoc()); - iterator = TraversalEngine.applyDecoratingIterators(false, + iterator = applyDecoratingIterators(false, iterator, reads.getDownsamplingFraction(), reads.getFilterZeroMappingQualityReads(), @@ -151,12 +154,16 @@ public class SAMDataSource implements SimpleDataSource { /** - * Gets the (potentially merged) SAM file header. + *

+ * seekLocus + *

* - * @return SAM file header. + * @param location the genome location to extract data for + * + * @return an iterator for that region */ - public SAMFileHeader getHeader() { - return iteratorPool.getHeader(); + private StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException { + return iteratorPool.iterator(new MappedStreamSegment(location)); } @@ -209,6 +216,15 @@ public class SAMDataSource implements SimpleDataSource { includeUnmappedReads = seeUnMappedReads; } + /** + * For unit testing, add a custom iterator pool. + * + * @param iteratorPool Custom mock iterator pool. + */ + void setResourcePool( SAMIteratorPool iteratorPool ) { + this.iteratorPool = iteratorPool; + } + /** * Retrieve unmapped reads. * @@ -333,6 +349,49 @@ public class SAMDataSource implements SimpleDataSource { return bound; } + /** + * Filter reads based on user-specified criteria. + * + * @param enableVerification Verify the order of reads. + * @param wrappedIterator the raw data source. + * @param downsamplingFraction whether and how much to downsample the reads themselves (not at a locus). + * @param filterZeroMappingQualityReads whether to filter zero mapping quality reads. + * @param beSafeP Another trigger for the verifying iterator? TODO: look into this. + * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. 
+ */ + private StingSAMIterator applyDecoratingIterators(boolean enableVerification, + StingSAMIterator wrappedIterator, + Double downsamplingFraction, + Boolean filterZeroMappingQualityReads, + Boolean beSafeP) { + wrappedIterator = new MalformedSAMFilteringIterator(wrappedIterator,violations); + + // NOTE: this (and other filtering) should be done before on-the-fly sorting + // as there is no reason to sort something that we will end up throwing away + if (downsamplingFraction != null) + wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction); + + if (beSafeP != null && beSafeP && enableVerification) + wrappedIterator = new VerifyingSamIterator(wrappedIterator); + + if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads ) + wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(), + new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc())); + + return wrappedIterator; + } + + private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter { + public boolean filterOut(SAMRecord rec) { + if (rec.getMappingQuality() == 0) { + //System.out.printf("Filtering 0 mapping quality read %s%n", rec.format()); + return true; + } else { + return false; + } + } + } + } class SAMIteratorPool extends ResourcePool { diff --git a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index d903a2b4e..e056dec9c 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -135,8 +135,9 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar throw new StingException("Unable to retrieve result", ex); } - traversalEngine.printOnTraversalDone(result); - walker.onTraversalDone(result); + walker.onTraversalDone(result); + + 
printOnTraversalDone(result); return result; } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 42250cf67..dd2b4a558 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -49,7 +49,7 @@ public class LinearMicroScheduler extends MicroScheduler { Object result = accumulator.finishTraversal(); - traversalEngine.printOnTraversalDone(result); + printOnTraversalDone(result); return accumulator; } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 263cda5a9..63834177a 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -207,6 +207,15 @@ public abstract class MicroScheduler { return new ShardDataProvider(shard, reads, reference, rods); } + /** + * Print summary information for the analysis. + * @param sum The final reduce output. + */ + protected void printOnTraversalDone(Object sum) { + logger.info(String.format("%n%s",reads.getViolationHistogram())); + traversalEngine.printOnTraversalDone(sum); + } + /** * Gets a data source for the given set of reads. 
* diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java new file mode 100644 index 000000000..0b001d146 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.utils.sam.SAMReadValidator; +import org.broadinstitute.sting.utils.sam.SAMReadValidationException; +import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; + +import java.util.NoSuchElementException; + +/** + * A decorating iterator that examines the stream of reads, discarding those + * that fail to meet a minimum standard for consumption by the GATK. + * + * @author hanna + * @version 0.1 + */ + +public class MalformedSAMFilteringIterator implements StingSAMIterator { + /** + * The wrapped iterator. Get reads from here. + */ + private StingSAMIterator wrapped = null; + + /** + * Collector for SAM read violations. + */ + private SAMReadViolationHistogram violations = null; + + /** + * The next SAMRecord to return. + */ + private SAMRecord next = null; + + /** + * Creates a new MalformedSAMFilteringIterator, and provides a collector for the violation counts. + * @param wrapped The wrapped iterator to use as backing data. + * @param violations A structure to hold a breakdown of validator violations. + */ + public MalformedSAMFilteringIterator( StingSAMIterator wrapped, SAMReadViolationHistogram violations ) { + this.wrapped = wrapped; + this.violations = violations; + seedNext(); + } + + /** + * Returns source information about the reads. + * @return Source information, as reported by the wrapped iterator. + */ + public Reads getSourceInfo() { + return wrapped.getSourceInfo(); + } + + /** + * Gets an iterator, helpful for foreach loops. + * @return An iterator sharing the same state variables as the current iterator. + */ + public StingSAMIterator iterator() { + return this; + } + + /** + * Checks to see whether there's a next valid record available. + * @return True if a next is available, false otherwise. + */ + public boolean hasNext() { + return next != null; + } + + /** + * Gets the next valid record from the stream. + * @return Next valid record. 
+ */ + public SAMRecord next() { + SAMRecord current = next; + if( current == null ) + throw new NoSuchElementException("MalformedSAMFilteringIterator: supply of reads is exhausted."); + seedNext(); + return current; + } + + /** + * Closes the wrapped iterator. + */ + public void close() { + wrapped.close(); + } + + /** + * Looks ahead for the next valid SAMRecord. + */ + protected void seedNext() { + next = null; + while( wrapped.hasNext() && next == null ) { + SAMRecord toTest = wrapped.next(); + try { + SAMReadValidator.validate(toTest); + next = toTest; + } + catch ( SAMReadValidationException ex ) { + violations.addViolation(ex); + } + } + } + + /** + * Throws an exception. Remove is not supported. + */ + public void remove() { throw new UnsupportedOperationException("Unable to remove from a StingSAMIterator"); } +} diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 939df9c31..bfd44589c 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -275,13 +275,10 @@ public abstract class TraversalEngine { /** * A passthrough method so that subclasses can report which types of traversals they're using. - * TODO: Make this method abstract once all traversals support it. * @param sum Result of the computation. * @param Type of the computation. */ - public void printOnTraversalDone( T sum ) { - throw new UnsupportedOperationException( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." 
); - } + public abstract void printOnTraversalDone( T sum ); /** * Called after a traversal to print out information about the traversal process @@ -296,7 +293,7 @@ public abstract class TraversalEngine { final long curTime = System.currentTimeMillis(); final double elapsed = (curTime - startTime) / 1000.0; logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600)); - logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)", + logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)", TraversalStatistics.nSkippedReads, TraversalStatistics.nReads, (TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads)); @@ -327,67 +324,6 @@ public abstract class TraversalEngine { return true; } - - @Deprecated - protected StingSAMIterator wrapReadsIterator( final Iterator rawIterator, final boolean enableVerification ) { - // Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to - // throw an exception if reads info isn't present. - StingSAMIterator wrappedIterator = StingSAMIteratorAdapter.adapt(null,rawIterator); - wrappedIterator = applyDecoratingIterators(enableVerification,wrappedIterator); - - return wrappedIterator; - } - - /** - * Repackage instance variables and call static method. - * TODO: This method's days are numbered. - * @param enableVerification - * @param wrappedIterator - * @return - */ - protected StingSAMIterator applyDecoratingIterators( final boolean enableVerification, final StingSAMIterator wrappedIterator ) { - return applyDecoratingIterators(enableVerification, - wrappedIterator, - DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null, - filterZeroMappingQualityReads, - beSafeP); - } - - /** - * WARNING: In TraversalEngine for backward compatibility ONLY. Reads are not used as the data source, only as parameters - * for validation. 
- */ - public static StingSAMIterator applyDecoratingIterators(boolean enableVerification, - StingSAMIterator wrappedIterator, - Double downsamplingFraction, - Boolean filterZeroMappingQualityReads, - Boolean beSafeP) { - // NOTE: this (and other filtering) should be done before on-the-fly sorting - // as there is no reason to sort something that we will end of throwing away - if (downsamplingFraction != null) - wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction); - - if (beSafeP != null && beSafeP && enableVerification) - wrappedIterator = new VerifyingSamIterator(wrappedIterator); - - if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads ) - wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(), - new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc())); - - return wrappedIterator; - } - - private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter { - public boolean filterOut(SAMRecord rec) { - if (rec.getMappingQuality() == 0) { - //System.out.printf("Filtering 0 mapping quality read %s%n", rec.format()); - return true; - } else { - return false; - } - } - } - protected SAMFileReader initializeSAMFile(File samFile) { // todo: fixme, this is a hack to try out dynamic merging if ( samFile.toString().endsWith(".list") ) { diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index c0a887b3c..b74ea7223 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -124,10 +124,6 @@ public class TraverseReads extends TraversalEngine { // get the genome loc from the read GenomeLoc site = GenomeLocParser.createGenomeLoc(read); - // this is a temporary fix to deal with unmapped reads which "map" to a given location and have a MAPPED flag set - if ( 
site.getStop() != -1 && site.getStop() < site.getStart() ) - continue; - // Jump forward in the reference to this locus location locus = new LocusContext(site, Arrays.asList(read), Arrays.asList(0)); diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidationException.java b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidationException.java new file mode 100644 index 000000000..7b1c7db03 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidationException.java @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import org.broadinstitute.sting.utils.StingException; + +/** + * Represents a validation failure, usually triggered by an inconsistency internal to the read. 
+ * @author hanna + * @version 0.1 + */ + +public class SAMReadValidationException extends StingException { + /** + * Create a validation exception with only a message; no other traceback info is provided. + * @param message The message to pass along to the user. + */ + public SAMReadValidationException(String message) { + super(message); + } + + /** + * Create a validation exception with a message and traceback info. + * @param message The message to pass along to the user. + * @param inner The exception to nest. + */ + public SAMReadValidationException(String message,Throwable inner) { + super(message,inner); + } +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java new file mode 100644 index 000000000..3f118d1d7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMRecord; + +/** + * Validates reads against a specific set of criteria. If it finds a + * read that fails to meet the given criteria, it will throw an exception. + * The caller can decide whether to ignore the error, hide the read + * from the user, or blow up in a spectacular ball of fire. + * + * @author hanna + * @version 0.1 + */ +public class SAMReadValidator { + /** + * Validate the sam read against a list of criteria that are known to cause failures in the GATK. + * Throw an exception if the read fails. + * @param read the read to validate. Must not be null. + */ + public static void validate( SAMRecord read ) throws SAMReadValidationException { + checkInvalidAlignmentStart(read); + checkInvalidAlignmentEnd(read); + checkCigarDisagreesWithAlignment(read); + } + + /** + * Check for the case in which the alignment start is inconsistent with the read unmapped flag. + * @param read The read to validate. + */ + private static void checkInvalidAlignmentStart( SAMRecord read ) { + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) + throw new SAMReadValidationException("read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START"); + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 ) + throw new SAMReadValidationException("Read is not flagged as 'unmapped', but alignment start is -1"); + } + + /** + * Check for invalid end of alignments. + * @param read The read to validate. 
+ */ + private static void checkInvalidAlignmentEnd( SAMRecord read ) { + if( read.getAlignmentEnd() != -1 && read.getAlignmentEnd() < read.getAlignmentStart() ) + throw new SAMReadValidationException("Alignment ends prior to its beginning"); + } + + /** + * Check for inconsistencies between the cigar string and the alignment start. + * @param read The read to validate. + */ + private static void checkCigarDisagreesWithAlignment( SAMRecord read ) { + if( read.getAlignmentStart() != -1 && + read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START && + read.getAlignmentBlocks().size() == 0 ) + throw new SAMReadValidationException("Read has a valid alignment start, but the CIGAR string is empty"); + } +} + diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMReadViolationHistogram.java b/java/src/org/broadinstitute/sting/utils/sam/SAMReadViolationHistogram.java new file mode 100644 index 000000000..cdfcd36fa --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/SAMReadViolationHistogram.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import java.util.*; + +/** + * Collects a series of violations to our SAM read validation criteria. + * + * @author hanna + * @version 0.1 + */ +public class SAMReadViolationHistogram { + private Map violations = new HashMap(); + + /** + * Add a violation to the database of violations. For now, track + * only the number of occurrences of a given violation. + * @param violation Violation to add, generated by the SAMReadValidator. + */ + public void addViolation( SAMReadValidationException violation ) { + String message = violation.getMessage(); + if( !violations.containsKey( message ) ) + violations.put( message, 0L ); + violations.put(message,violations.get(message)+1); + } + + public long getViolationCount() { + long totalViolations = 0L; + Collection violationCounts = violations.values(); + for( Long violationCount: violationCounts ) + totalViolations += violationCount; + return totalViolations; + } + + public String toString() { + if( getViolationCount() == 0 ) + return ""; + + StringBuilder violationOutput = new StringBuilder(); + violationOutput.append("Eliminated malformed reads for the following reasons:\n"); + for(Map.Entry violation: violations.entrySet()) + violationOutput.append( String.format("\t%s: %d%n", violation.getKey(), violation.getValue()) ); + + return violationOutput.toString(); + } +}