diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 1879771f5..da777e97a 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -58,11 +58,6 @@ public class SAMDataSource implements SimpleDataSource { /** Backing support for reads. */ private final Reads reads; - /** - * A histogram of exactly what reads were removed from the input stream and why. - */ - private SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); - /** our log, which we want to capture anything from this class */ protected static Logger logger = Logger.getLogger(SAMDataSource.class); @@ -87,7 +82,7 @@ public class SAMDataSource implements SimpleDataSource { * @return Histogram of reads. Will not be null. */ public SAMReadViolationHistogram getViolationHistogram() { - return violations; + return iteratorPool.getViolationHistogram(); } /** @@ -364,8 +359,6 @@ public class SAMDataSource implements SimpleDataSource { Double downsamplingFraction, Boolean filterZeroMappingQualityReads, Boolean beSafeP) { - wrappedIterator = new MalformedSAMFilteringIterator(wrappedIterator,violations); - // NOTE: this (and other filtering) should be done before on-the-fly sorting // as there is no reason to sort something that we will end of throwing away if (downsamplingFraction != null) @@ -398,6 +391,11 @@ class SAMIteratorPool extends ResourcePool /** Source information about the reads. */ protected Reads reads; + /** + * A histogram of exactly what reads were removed from the input stream and why. + */ + private SAMReadViolationHistogram violations = new SAMReadViolationHistogram(); + /** Is this a by-reads traversal or a by-locus? 
*/ protected boolean queryOverlapping; @@ -422,6 +420,14 @@ class SAMIteratorPool extends ResourcePool return header; } + /** + * Returns a histogram of reads that were screened out, grouped by the nature of the error. + * @return Histogram of reads. Will not be null. + */ + public SAMReadViolationHistogram getViolationHistogram() { + return violations; + } + protected ReadStreamPointer selectBestExistingResource( DataStreamSegment segment, List pointers ) { for (ReadStreamPointer pointer : pointers) { if (pointer.canAccessSegmentEfficiently(segment)) { @@ -446,7 +452,7 @@ class SAMIteratorPool extends ResourcePool iterator = streamPointer.getReadsOverlapping((MappedStreamSegment) segment); } - return new ReleasingIterator(iterator); + return new ReleasingIterator(new MalformedSAMFilteringIterator(header, iterator, violations)); } protected void closeResource( ReadStreamPointer resource ) { diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java index 0b001d146..d5c3a0ac7 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.sam.SAMReadValidator; import org.broadinstitute.sting.utils.sam.SAMReadValidationException; @@ -41,6 +42,11 @@ import java.util.NoSuchElementException; */ public class MalformedSAMFilteringIterator implements StingSAMIterator { + /** + * The header to validate reads against. + */ + private SAMFileHeader header = null; + /** * The wrapped iterator. Get reads from here. 
*/ @@ -61,7 +67,9 @@ public class MalformedSAMFilteringIterator implements StingSAMIterator { + * @param header The header to validate reads against. * @param wrapped The wrapped iterator to use as backing data. * @param violations A structure to hold a breakdown of validator violations. */ - public MalformedSAMFilteringIterator( StingSAMIterator wrapped, SAMReadViolationHistogram violations ) { + public MalformedSAMFilteringIterator( SAMFileHeader header, StingSAMIterator wrapped, SAMReadViolationHistogram violations ) { + this.header = header; this.wrapped = wrapped; this.violations = violations; seedNext(); @@ -118,7 +126,7 @@ public class MalformedSAMFilteringIterator implements StingSAMIterator { while( wrapped.hasNext() && next == null ) { SAMRecord toTest = wrapped.next(); try { - SAMReadValidator.validate(toTest); + SAMReadValidator.validate(header,toTest); next = toTest; } catch ( SAMReadValidationException ex ) { diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java index 3f118d1d7..ba819aa36 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java +++ b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceRecord; /** * Validates reads against a specific set of criteria. If it finds a @@ -41,9 +43,10 @@ public class SAMReadValidator { * Throw an exception if the read fails. * @param read the read to validate. Must not be null. 
*/ - public static void validate( SAMRecord read ) throws SAMReadValidationException { + public static void validate( SAMFileHeader header, SAMRecord read ) throws SAMReadValidationException { checkInvalidAlignmentStart(read); checkInvalidAlignmentEnd(read); + checkAlignmentDisagreesWithHeader(header,read); checkCigarDisagreesWithAlignment(read); } @@ -67,6 +70,23 @@ public class SAMReadValidator { throw new SAMReadValidationException("Alignment ends prior to its beginning"); } + /** + * Check that a mapped read starts on a contig listed in the header, at or before that contig's recorded length. + * Unmapped reads are skipped, since they carry no alignment to compare against the header. + * @param header The header whose sequence records are consulted. + * @param read The read to validate. + */ + private static void checkAlignmentDisagreesWithHeader( SAMFileHeader header, SAMRecord read ) { + if( read.getReadUnmappedFlag() ) + return; + SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() ); + /* The lookup may yield no record when the read's reference index is absent from the header; reject the read instead of dereferencing null. */ + if( contigHeader == null ) + throw new SAMReadValidationException("Read is aligned to a contig that is not present in the header"); + if( read.getAlignmentStart() > contigHeader.getSequenceLength() ) + throw new SAMReadValidationException("Read is aligned to a point after the end of the contig"); + } + /** * Check for inconsistencies between the cigar string and the * @param read The read to validate.