Filter out reads whose alignment starts past the end of the contig to which they allegedly align.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1188 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-07-07 22:27:44 +00:00
parent 194b75613b
commit 4ba2194b5e
3 changed files with 35 additions and 12 deletions

View File

@ -58,11 +58,6 @@ public class SAMDataSource implements SimpleDataSource {
/** Backing support for reads. */
private final Reads reads;
/**
* A histogram of exactly what reads were removed from the input stream and why.
*/
private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
@ -87,7 +82,7 @@ public class SAMDataSource implements SimpleDataSource {
* @return Histogram of reads. Will not be null.
*/
public SAMReadViolationHistogram getViolationHistogram() {
return violations;
// Delegate to the iterator pool (presumably the SAMIteratorPool declared later in this file),
// which holds the histogram and passes it to the filtering iterator it builds.
return iteratorPool.getViolationHistogram();
}
/**
@ -364,8 +359,6 @@ public class SAMDataSource implements SimpleDataSource {
Double downsamplingFraction,
Boolean filterZeroMappingQualityReads,
Boolean beSafeP) {
wrappedIterator = new MalformedSAMFilteringIterator(wrappedIterator,violations);
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end up throwing away
if (downsamplingFraction != null)
@ -398,6 +391,11 @@ class SAMIteratorPool extends ResourcePool<ReadStreamPointer, StingSAMIterator>
/** Source information about the reads. */
protected Reads reads;
/**
* A histogram of exactly what reads were removed from the input stream and why.
*/
private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
/** Is this a by-reads traversal or a by-locus? */
protected boolean queryOverlapping;
@ -422,6 +420,14 @@ class SAMIteratorPool extends ResourcePool<ReadStreamPointer, StingSAMIterator>
return header;
}
/**
* Returns a histogram of reads that were screened out, grouped by the nature of the error.
* The pool's own histogram instance is returned directly (not a copy); it is the same
* object this pool hands to the MalformedSAMFilteringIterator wrapped around its iterators.
* @return Histogram of reads. Will not be null.
*/
public SAMReadViolationHistogram getViolationHistogram() {
return violations;
}
protected ReadStreamPointer selectBestExistingResource( DataStreamSegment segment, List<ReadStreamPointer> pointers ) {
for (ReadStreamPointer pointer : pointers) {
if (pointer.canAccessSegmentEfficiently(segment)) {
@ -446,7 +452,7 @@ class SAMIteratorPool extends ResourcePool<ReadStreamPointer, StingSAMIterator>
iterator = streamPointer.getReadsOverlapping((MappedStreamSegment) segment);
}
return new ReleasingIterator(iterator);
return new ReleasingIterator(new MalformedSAMFilteringIterator(header, iterator, violations));
}
protected void closeResource( ReadStreamPointer resource ) {

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.sam.SAMReadValidator;
import org.broadinstitute.sting.utils.sam.SAMReadValidationException;
@ -41,6 +42,11 @@ import java.util.NoSuchElementException;
*/
public class MalformedSAMFilteringIterator implements StingSAMIterator {
/**
* The header to validate reads against.
*/
private SAMFileHeader header = null;
/**
* The wrapped iterator. Get reads from here.
*/
@ -61,7 +67,8 @@ public class MalformedSAMFilteringIterator implements StingSAMIterator {
* @param header The header to validate reads against.
* @param wrapped The wrapped iterator to use as backing data.
* @param violations A structure to hold a breakdown of validator violations.
*/
public MalformedSAMFilteringIterator( StingSAMIterator wrapped, SAMReadViolationHistogram violations ) {
public MalformedSAMFilteringIterator( SAMFileHeader header, StingSAMIterator wrapped, SAMReadViolationHistogram violations ) {
this.header = header;
this.wrapped = wrapped;
this.violations = violations;
seedNext();
@ -118,7 +125,7 @@ public class MalformedSAMFilteringIterator implements StingSAMIterator {
while( wrapped.hasNext() && next == null ) {
SAMRecord toTest = wrapped.next();
try {
SAMReadValidator.validate(toTest);
SAMReadValidator.validate(header,toTest);
next = toTest;
}
catch ( SAMReadValidationException ex ) {

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceRecord;
/**
* Validates reads against a specific set of criteria. If it finds a
@ -41,9 +43,10 @@ public class SAMReadValidator {
* Throw an exception if the read fails.
* @param header the header describing the contigs the read is validated against. Must not be null.
* @param read the read to validate. Must not be null.
*/
public static void validate( SAMRecord read ) throws SAMReadValidationException {
public static void validate( SAMFileHeader header, SAMRecord read ) throws SAMReadValidationException {
// Checks run in the order listed; a check that throws prevents the later ones from running.
checkInvalidAlignmentStart(read);
checkInvalidAlignmentEnd(read);
// The only check that consults the header: it compares the alignment start to the contig length.
checkAlignmentDisagreesWithHeader(header,read);
checkCigarDisagreesWithAlignment(read);
}
@ -67,6 +70,13 @@ public class SAMReadValidator {
throw new SAMReadValidationException("Alignment ends prior to its beginning");
}
/**
 * Check that a mapped read starts within the contig to which the header says it aligns.
 * Reads flagged as unmapped are not checked.
 * @param header The header whose sequence entries are used to look up the read's contig.
 * @param read The read to validate.
 * @throws SAMReadValidationException if the read's reference index has no sequence entry in
 *         the header, or if its alignment start is greater than that sequence's length.
 */
private static void checkAlignmentDisagreesWithHeader( SAMFileHeader header, SAMRecord read ) {
    // Unmapped reads have no contig to compare against, so skip the header lookup entirely.
    if( read.getReadUnmappedFlag() ) {
        return;
    }

    SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() );

    // A mapped read can reference a contig the header does not list; report that as a
    // validation failure rather than dereferencing a missing sequence record below.
    if( contigHeader == null ) {
        throw new SAMReadValidationException("Read is aligned to a contig that is not present in the header");
    }

    if( read.getAlignmentStart() > contigHeader.getSequenceLength() ) {
        throw new SAMReadValidationException("Read is aligned to a point after the end of the contig");
    }
}
/**
* Check for inconsistencies between the cigar string and the alignment.
* @param read The read to validate.