diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
index 40fa2a003..49b805fcd 100755
--- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
@@ -178,14 +178,6 @@ public abstract class LocusView extends LocusContextIterator implements View {
TraversalStatistics.nBadAlignments++;
result = true;
why = "No alignment start";
- } else if (rec.getAlignmentEnd() != -1 && rec.getAlignmentEnd() < rec.getAlignmentStart() ) {
- TraversalStatistics.nBadAlignments++;
- result = true;
- why = "Alignment ends before it starts";
- } else if (rec.getAlignmentStart() != -1 && rec.getAlignmentBlocks().size() == 0) {
- TraversalStatistics.nBadAlignments++;
- result = true;
- why = "Alignment cigar string is invalid";
} else if (rec.getDuplicateReadFlag()) {
TraversalStatistics.nDuplicates++;
result = true;
diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java
index cf2edaf43..1879771f5 100755
--- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java
@@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
+import net.sf.picard.filter.FilteringIterator;
+import net.sf.picard.filter.SamRecordFilter;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
@@ -12,6 +14,8 @@ import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.sam.SAMReadValidator;
+import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
import java.io.File;
import java.util.List;
@@ -54,6 +58,11 @@ public class SAMDataSource implements SimpleDataSource {
/** Backing support for reads. */
private final Reads reads;
+ /**
+ * A histogram of exactly what reads were removed from the input stream and why.
+ */
+ private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
+
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
@@ -73,6 +82,14 @@ public class SAMDataSource implements SimpleDataSource {
// A pool of SAM iterators.
private SAMIteratorPool iteratorPool = null;
+ /**
+ * Returns the histogram that collects reads screened out by this data source, grouped by the nature of the error.
+ * @return The live histogram held by this data source (not a copy); initialized at field declaration, so not null.
+ */
+ public SAMReadViolationHistogram getViolationHistogram() {
+ return violations;
+ }
+
/**
* constructor, given sam files
*
@@ -94,26 +111,12 @@ public class SAMDataSource implements SimpleDataSource {
}
/**
- * For unit testing, add a custom iterator pool.
+ * Gets the (potentially merged) SAM file header.
*
- * @param iteratorPool Custom mock iterator pool.
+ * @return SAM file header.
*/
- void setResourcePool( SAMIteratorPool iteratorPool ) {
- this.iteratorPool = iteratorPool;
- }
-
-
- /**
- *
- * seekLocus
- *
- *
- * @param location the genome location to extract data for
- *
- * @return an iterator for that region
- */
- public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
- return iteratorPool.iterator(new MappedStreamSegment(location));
+ public SAMFileHeader getHeader() {
+ return iteratorPool.getHeader();
}
/**
@@ -130,14 +133,14 @@ public class SAMDataSource implements SimpleDataSource {
StingSAMIterator iterator = null;
if (shard.getShardType() == Shard.ShardType.READ) {
iterator = seekRead((ReadShard) shard);
- iterator = TraversalEngine.applyDecoratingIterators(true,
+ iterator = applyDecoratingIterators(true,
iterator,
reads.getDownsamplingFraction(),
reads.getFilterZeroMappingQualityReads(),
reads.getSafetyChecking());
} else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) {
iterator = seekLocus(shard.getGenomeLoc());
- iterator = TraversalEngine.applyDecoratingIterators(false,
+ iterator = applyDecoratingIterators(false,
iterator,
reads.getDownsamplingFraction(),
reads.getFilterZeroMappingQualityReads(),
@@ -151,12 +154,16 @@ public class SAMDataSource implements SimpleDataSource {
/**
- * Gets the (potentially merged) SAM file header.
+ *
+ * seekLocus
+ *
*
- * @return SAM file header.
+ * @param location the genome location to extract data for
+ *
+ * @return an iterator for that region
*/
- public SAMFileHeader getHeader() {
- return iteratorPool.getHeader();
+ private StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
+ return iteratorPool.iterator(new MappedStreamSegment(location));
}
@@ -209,6 +216,15 @@ public class SAMDataSource implements SimpleDataSource {
includeUnmappedReads = seeUnMappedReads;
}
+ /**
+ * For unit testing, replace this data source's iterator pool with a custom one.
+ * Package-private; the supplied pool is stored as-is, with no null check.
+ * @param iteratorPool Custom mock iterator pool.
+ */
+ void setResourcePool( SAMIteratorPool iteratorPool ) {
+ this.iteratorPool = iteratorPool;
+ }
+
/**
* Retrieve unmapped reads.
*
@@ -333,6 +349,49 @@ public class SAMDataSource implements SimpleDataSource {
return bound;
}
+ /**
+ * Wraps the raw iterator in the malformed-read filter (always), then in whichever optional decorators the arguments enable.
+ * Reads rejected by the malformed-read filter are tallied in this data source's violation histogram.
+ * @param enableVerification must be true (together with beSafeP) for the VerifyingSamIterator to be applied.
+ * @param wrappedIterator the raw data source.
+ * @param downsamplingFraction fraction handed to the DownsampleIterator (reads themselves, not at a locus); null disables it.
+ * @param filterZeroMappingQualityReads true to filter zero mapping quality reads; null or false leaves them in.
+ * @param beSafeP must be non-null and true (together with enableVerification) to apply the VerifyingSamIterator. TODO: confirm intent.
+ * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
+ */
+ private StingSAMIterator applyDecoratingIterators(boolean enableVerification,
+ StingSAMIterator wrappedIterator,
+ Double downsamplingFraction,
+ Boolean filterZeroMappingQualityReads,
+ Boolean beSafeP) {
+ wrappedIterator = new MalformedSAMFilteringIterator(wrappedIterator,violations);
+
+ // NOTE: this (and other filtering) should be done before on-the-fly sorting
+ // as there is no reason to sort something that we will end up throwing away
+ if (downsamplingFraction != null)
+ wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
+
+ if (beSafeP != null && beSafeP && enableVerification)
+ wrappedIterator = new VerifyingSamIterator(wrappedIterator);
+
+ if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads )
+ wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
+ new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc()));
+
+ return wrappedIterator;
+ }
+
+    /** Record filter that screens out every read whose mapping quality is zero. */
+    private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
+        /**
+         * @param rec the read to examine.
+         * @return true (filter the read out) exactly when its mapping quality is 0.
+         */
+        public boolean filterOut(SAMRecord rec) {
+            return rec.getMappingQuality() == 0;
+        }
+    }
+
}
class SAMIteratorPool extends ResourcePool {
diff --git a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
index d903a2b4e..e056dec9c 100755
--- a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
+++ b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
@@ -135,8 +135,9 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
throw new StingException("Unable to retrieve result", ex);
}
- traversalEngine.printOnTraversalDone(result);
- walker.onTraversalDone(result);
+ walker.onTraversalDone(result);
+
+ printOnTraversalDone(result);
return result;
}
diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index 42250cf67..dd2b4a558 100644
--- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -49,7 +49,7 @@ public class LinearMicroScheduler extends MicroScheduler {
Object result = accumulator.finishTraversal();
- traversalEngine.printOnTraversalDone(result);
+ printOnTraversalDone(result);
return accumulator;
}
diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
index 263cda5a9..63834177a 100755
--- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
+++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
@@ -207,6 +207,15 @@ public abstract class MicroScheduler {
return new ShardDataProvider(shard, reads, reference, rods);
}
+ /**
+ * Print summary information for the analysis: first log the violation histogram obtained from the reads, then the traversal engine's own report.
+ * @param sum The final reduce output; passed through unchanged to the traversal engine.
+ */
+ protected void printOnTraversalDone(Object sum) {
+ logger.info(String.format("%n%s",reads.getViolationHistogram()));
+ traversalEngine.printOnTraversalDone(sum);
+ }
+
/**
* Gets a data source for the given set of reads.
*
diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java
new file mode 100644
index 000000000..0b001d146
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/iterators/MalformedSAMFilteringIterator.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2009 The Broad Institute
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.iterators;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.Reads;
+import org.broadinstitute.sting.utils.sam.SAMReadValidator;
+import org.broadinstitute.sting.utils.sam.SAMReadValidationException;
+import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
+
+import java.util.NoSuchElementException;
+
+/**
+ * A decorating iterator that examines the stream of reads, discarding those
+ * that fail to meet a minimum standard for consumption by the GATK.
+ * It always holds one validated read in reserve, so the wrapped iterator runs ahead of this one.
+ * @author hanna
+ * @version 0.1
+ */
+
+public class MalformedSAMFilteringIterator implements StingSAMIterator {
+ /**
+ * The wrapped iterator that supplies the unfiltered reads.
+ */
+ private StingSAMIterator wrapped = null;
+
+ /**
+ * Collector for SAM read violations; every rejected read adds one violation to it.
+ */
+ private SAMReadViolationHistogram violations = null;
+
+ /**
+ * The next valid SAMRecord to return, or null once the wrapped iterator has no valid reads left.
+ */
+ private SAMRecord next = null;
+
+ /**
+ * Creates a new MalformedSAMFilteringIterator and immediately reads ahead in the wrapped iterator to the first valid read.
+ * @param wrapped The wrapped iterator to use as backing data.
+ * @param violations A structure to hold a breakdown of validator violations.
+ */
+ public MalformedSAMFilteringIterator( StingSAMIterator wrapped, SAMReadViolationHistogram violations ) {
+ this.wrapped = wrapped;
+ this.violations = violations;
+ seedNext();
+ }
+
+ /**
+ * Returns source information about the reads.
+ * @return Whatever source information the wrapped iterator reports.
+ */
+ public Reads getSourceInfo() {
+ return wrapped.getSourceInfo();
+ }
+
+ /**
+ * Gets an iterator, helpful for foreach loops.
+ * @return This same object, so the returned iterator shares all state with the current iterator.
+ */
+ public StingSAMIterator iterator() {
+ return this;
+ }
+
+ /**
+ * Checks to see whether there's a valid read waiting to be returned.
+ * @return True if a next is available, false otherwise.
+ */
+ public boolean hasNext() {
+ return next != null;
+ }
+
+ /**
+ * Gets the next valid record from the stream, then looks ahead for the one after it.
+ * @return Next valid record; a NoSuchElementException is thrown instead if none remain.
+ */
+ public SAMRecord next() {
+ SAMRecord current = next;
+ if( current == null )
+ throw new NoSuchElementException("MalformedSAMFilteringIterator: supply of reads is exhausted.");
+ seedNext();
+ return current;
+ }
+
+ /**
+ * Closes the wrapped iterator.
+ */
+ public void close() {
+ wrapped.close();
+ }
+
+ /**
+ * Looks ahead for the next valid SAMRecord, recording a violation for each invalid read skipped on the way.
+ */
+ protected void seedNext() {
+ next = null;
+ while( wrapped.hasNext() && next == null ) {
+ SAMRecord toTest = wrapped.next();
+ try {
+ SAMReadValidator.validate(toTest);
+ next = toTest;
+ }
+ catch ( SAMReadValidationException ex ) {
+ violations.addViolation(ex);
+ }
+ }
+ }
+
+ /**
+ * Throws an UnsupportedOperationException. Remove is not supported.
+ */
+ public void remove() { throw new UnsupportedOperationException("Unable to remove from a StingSAMIterator"); }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
index 939df9c31..bfd44589c 100755
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
@@ -275,13 +275,10 @@ public abstract class TraversalEngine {
/**
* A passthrough method so that subclasses can report which types of traversals they're using.
- * TODO: Make this method abstract once all traversals support it.
* @param sum Result of the computation.
* @param Type of the computation.
*/
- public void printOnTraversalDone( T sum ) {
- throw new UnsupportedOperationException( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." );
- }
+ public abstract void printOnTraversalDone( T sum );
/**
* Called after a traversal to print out information about the traversal process
@@ -296,7 +293,7 @@ public abstract class TraversalEngine {
final long curTime = System.currentTimeMillis();
final double elapsed = (curTime - startTime) / 1000.0;
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
- logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)",
+ logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
TraversalStatistics.nSkippedReads,
TraversalStatistics.nReads,
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
@@ -327,67 +324,6 @@ public abstract class TraversalEngine {
return true;
}
-
- @Deprecated
- protected StingSAMIterator wrapReadsIterator( final Iterator rawIterator, final boolean enableVerification ) {
- // Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to
- // throw an exception if reads info isn't present.
- StingSAMIterator wrappedIterator = StingSAMIteratorAdapter.adapt(null,rawIterator);
- wrappedIterator = applyDecoratingIterators(enableVerification,wrappedIterator);
-
- return wrappedIterator;
- }
-
- /**
- * Repackage instance variables and call static method.
- * TODO: This method's days are numbered.
- * @param enableVerification
- * @param wrappedIterator
- * @return
- */
- protected StingSAMIterator applyDecoratingIterators( final boolean enableVerification, final StingSAMIterator wrappedIterator ) {
- return applyDecoratingIterators(enableVerification,
- wrappedIterator,
- DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null,
- filterZeroMappingQualityReads,
- beSafeP);
- }
-
- /**
- * WARNING: In TraversalEngine for backward compatibility ONLY. Reads are not used as the data source, only as parameters
- * for validation.
- */
- public static StingSAMIterator applyDecoratingIterators(boolean enableVerification,
- StingSAMIterator wrappedIterator,
- Double downsamplingFraction,
- Boolean filterZeroMappingQualityReads,
- Boolean beSafeP) {
- // NOTE: this (and other filtering) should be done before on-the-fly sorting
- // as there is no reason to sort something that we will end of throwing away
- if (downsamplingFraction != null)
- wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
-
- if (beSafeP != null && beSafeP && enableVerification)
- wrappedIterator = new VerifyingSamIterator(wrappedIterator);
-
- if ( filterZeroMappingQualityReads != null && filterZeroMappingQualityReads )
- wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
- new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc()));
-
- return wrappedIterator;
- }
-
- private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
- public boolean filterOut(SAMRecord rec) {
- if (rec.getMappingQuality() == 0) {
- //System.out.printf("Filtering 0 mapping quality read %s%n", rec.format());
- return true;
- } else {
- return false;
- }
- }
- }
-
protected SAMFileReader initializeSAMFile(File samFile) {
// todo: fixme, this is a hack to try out dynamic merging
if ( samFile.toString().endsWith(".list") ) {
diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java
index c0a887b3c..b74ea7223 100755
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java
@@ -124,10 +124,6 @@ public class TraverseReads extends TraversalEngine {
// get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
- // this is a temporary fix to deal with unmapped reads which "map" to a given location and have a MAPPED flag set
- if ( site.getStop() != -1 && site.getStop() < site.getStart() )
- continue;
-
// Jump forward in the reference to this locus location
locus = new LocusContext(site, Arrays.asList(read), Arrays.asList(0));
diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidationException.java b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidationException.java
new file mode 100644
index 000000000..7b1c7db03
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidationException.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2009 The Broad Institute
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.sam;
+
+import org.broadinstitute.sting.utils.StingException;
+
+/**
+ * Represents a validation failure, usually triggered by an inconsistency internal to the read.
+ * @author hanna
+ * @version 0.1
+ */
+
+public class SAMReadValidationException extends StingException {
+ /**
+ * Create a validation exception with only a message; no nested cause is attached.
+ * @param message The message to pass along to the user; SAMReadViolationHistogram tallies violations by this text.
+ */
+ public SAMReadValidationException(String message) {
+ super(message);
+ }
+
+ /**
+ * Create a validation exception with a message and a nested cause.
+ * @param message The message to pass along to the user; SAMReadViolationHistogram tallies violations by this text.
+ * @param inner The exception to nest.
+ */
+ public SAMReadValidationException(String message,Throwable inner) {
+ super(message,inner);
+ }
+}
diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java
new file mode 100644
index 000000000..3f118d1d7
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/utils/sam/SAMReadValidator.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2009 The Broad Institute
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.sam;
+
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Validates reads against a specific set of criteria. If it finds a
+ * read that fails to meet the given criteria, it will throw an exception.
+ * The caller can decide whether to ignore the error, hide the read
+ * from the user, or blow up in a spectacular ball of fire.
+ *
+ * @author hanna
+ * @version 0.1
+ */
+public class SAMReadValidator {
+ /**
+ * Validate the sam read against a list of criteria that are known to cause failures in the GATK.
+ * Checks run in a fixed order (start, end, cigar); a SAMReadValidationException reports the first one that fails.
+ * @param read the read to validate. Must not be null.
+ */
+ public static void validate( SAMRecord read ) throws SAMReadValidationException {
+ checkInvalidAlignmentStart(read);
+ checkInvalidAlignmentEnd(read);
+ checkCigarDisagreesWithAlignment(read);
+ }
+
+ /**
+ * Check for a read that is not flagged unmapped yet whose alignment start is NO_ALIGNMENT_START or -1.
+ * @param read The read to validate.
+ */
+ private static void checkInvalidAlignmentStart( SAMRecord read ) {
+ if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START )
+ throw new SAMReadValidationException("read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START");
+ if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 )
+ throw new SAMReadValidationException("Read is not flagged as 'unmapped', but alignment start is -1");
+ }
+
+ /**
+ * Check for an alignment end that is set (not -1) but falls before the alignment start.
+ * @param read The read to validate.
+ */
+ private static void checkInvalidAlignmentEnd( SAMRecord read ) {
+ if( read.getAlignmentEnd() != -1 && read.getAlignmentEnd() < read.getAlignmentStart() )
+ throw new SAMReadValidationException("Alignment ends prior to its beginning");
+ }
+
+ /**
+ * Check for inconsistencies between the cigar string and the alignment start: a start is set but there are no alignment blocks.
+ * @param read The read to validate.
+ */
+ private static void checkCigarDisagreesWithAlignment( SAMRecord read ) {
+ if( read.getAlignmentStart() != -1 &&
+ read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START &&
+ read.getAlignmentBlocks().size() == 0 )
+ throw new SAMReadValidationException("Read has a valid alignment start, but the CIGAR string is empty");
+ }
+}
+
diff --git a/java/src/org/broadinstitute/sting/utils/sam/SAMReadViolationHistogram.java b/java/src/org/broadinstitute/sting/utils/sam/SAMReadViolationHistogram.java
new file mode 100644
index 000000000..cdfcd36fa
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/utils/sam/SAMReadViolationHistogram.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2009 The Broad Institute
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.sam;
+
+import java.util.*;
+
+/**
+ * Collects a series of violations to our SAM read validation criteria.
+ * NOTE(review): the backing map is unsynchronized; confirm an instance is never fed by several threads at once.
+ * @author hanna
+ * @version 0.1
+ */
+public class SAMReadViolationHistogram {
+    private final Map<String,Long> violations = new LinkedHashMap<String,Long>();
+
+    /**
+     * Add a violation to the database of violations.  For now, track
+     * only the number of occurrences of a given violation, keyed by its message.
+     * @param violation Violation to add, generated by the SAMReadValidator.
+     */
+    public void addViolation( SAMReadValidationException violation ) {
+        String message = violation.getMessage();
+        Long previousCount = violations.get( message );
+        violations.put( message, previousCount == null ? 1L : previousCount + 1L );
+    }
+
+    /** @return the total number of violations recorded, summed over every distinct message. */
+    public long getViolationCount() {
+        long totalViolations = 0L;
+        for( Long violationCount: violations.values() )
+            totalViolations += violationCount;
+        return totalViolations;
+    }
+
+    /** @return a header plus one "message: count" line per violation, in first-seen order; "" if none. */
+    @Override
+    public String toString() {
+        if( getViolationCount() == 0 )
+            return "";
+
+        StringBuilder violationOutput = new StringBuilder();
+        violationOutput.append("Eliminated malformed reads for the following reasons:\n");
+        for(Map.Entry<String,Long> violation: violations.entrySet())
+            violationOutput.append( String.format("\t%s: %d%n", violation.getKey(), violation.getValue()) );
+        return violationOutput.toString();
+    }
+}