From 0b16253db366b7f6d8d0782c766ce6c005ef5aea Mon Sep 17 00:00:00 2001 From: aaron Date: Thu, 23 Jul 2009 23:59:48 +0000 Subject: [PATCH] an iterator to fix the problem where read-based interval traversals are getting duplicate reads because reads span the two intervals. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1305 348d0f76-0448-11de-a6fe-93d51630548a --- .../simpleDataSources/SAMDataSource.java | 3 +- .../iterators/IntervalOverlapIterator.java | 109 ++++++++++++++++++ .../IntervalOverlapIteratorTest.java | 80 +++++++++++++ 3 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIterator.java create mode 100644 java/test/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIteratorTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 5122b691c..2b90c0be9 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -52,7 +52,6 @@ import java.util.List; */ public class SAMDataSource implements SimpleDataSource { - /** Backing support for reads. */ private final Reads reads; @@ -63,7 +62,7 @@ public class SAMDataSource implements SimpleDataSource { long readsTaken = 0; // our last genome loc position - GenomeLoc lastReadPos = null; + protected GenomeLoc lastReadPos = null; // do we take unmapped reads private boolean includeUnmappedReads = true; diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIterator.java new file mode 100644 index 000000000..8b4f197be --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIterator.java @@ -0,0 +1,109 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.Iterator; + + +/** + * + * @author aaron + * + * Class DuplicateDetectorIterator + * + * remove reads that overlap the passed in interval, yea! + * + */ +public class IntervalOverlapIterator implements StingSAMIterator { + + // our wrapped iterator + private final StingSAMIterator mIter; + private final boolean throwException; + + // storage for the next record + private SAMRecord mNextRecord = null; + + // the genomic location which we filter on + private final GenomeLoc mLoc; + + /** + * Create a DuplicateDetectorIterator from another sam iterator + * @param iter something that implements StingSAMIterator + * @param blowUpOnDup if we find a dup, do we throw an exception (blow up) or do we drop it + */ + public IntervalOverlapIterator(StingSAMIterator iter, GenomeLoc filterLocation, boolean blowUpOnDup) { + this.mIter = iter; + this.throwException = blowUpOnDup; + this.mLoc = filterLocation; + if (iter.hasNext()) { + next(); + } + } + + /** + * Gets source information for the reads. Contains information about the original reads + * files, plus information about downsampling, etc. + * + * @return + */ + public Reads getSourceInfo() { + return mIter.getSourceInfo(); + } + + /** + * close this iterator + */ + public void close() { + if (mIter != null) mIter.close(); + } + + /** + * do we have a next? + * @return true if yes, false if not + */ + public boolean hasNext() { + return (mNextRecord != null); + } + + /** + * get the next record + * @return a SAMRecord + */ + public SAMRecord next() { + SAMRecord ret = mNextRecord; + while (mIter.hasNext()) { + mNextRecord = mIter.next(); + if (!isOverlaping(mNextRecord)) return ret; + } + mNextRecord = null; + return ret; + } + + /** + * not supported + */ + public void remove() { + throw new UnsupportedOperationException("You can't call remove, so, like, I guess please don't"); + } + + /** + * create an iterator out of the this type + * @return this! + */ + public Iterator iterator() { + return this; + } + + /** + * determine if a read overlaps the specified interval that was passed in + * @param rec the read + * @return true if it overlaps, false otherwise + */ + private boolean isOverlaping(SAMRecord rec) { + return mLoc.overlapsP(GenomeLocParser.createGenomeLoc(rec)); + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIteratorTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIteratorTest.java new file mode 100644 index 000000000..b691c331c --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/iterators/IntervalOverlapIteratorTest.java @@ -0,0 +1,80 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.junit.Assert; +import static org.junit.Assert.fail; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Iterator; + + +/** + * @author aaron + *

+ * Class DuplicateDetectorIteratorTest + *

+ * test the DuplicateDetectorIterator class. + */ +public class IntervalOverlapIteratorTest extends BaseTest { + private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"); + private static IndexedFastaSequenceFile seq; + private int chromosomeOneReadCount = 885; + + //GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary()); + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeClass + public static void beforeAll() { + try { + seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + } catch (FileNotFoundException e) { + fail("Unexpected Exception" + e.getLocalizedMessage()); + } + GenomeLocParser.setupRefContigOrdering(seq); + } + + @Test + public void testOverlappingIntervals() { + int countOfReads = 0; + int seqLength = seq.getSequenceDictionary().getSequence("chr1").getSequenceLength(); + GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, 470535); + + // first count the initial pile of reads + SAMFileReader reader = new SAMFileReader(bam, true); + reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT); + Iterator i = reader.queryOverlapping("chr1",1,470535); + GenomeLoc newLoc; + while (i.hasNext()) { + i.next(); + countOfReads++; + } + reader.close(); + while (last.getStart() < seq.getSequenceDictionary().getSequence("chr1").getSequenceLength()) { + reader = new SAMFileReader(bam, true); + long stop = (last.getStop() >= seqLength) ? seqLength : last.getStop() + 470535; + newLoc = GenomeLocParser.createGenomeLoc(last.getContigIndex(),last.getStart()+470535,stop); + reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT); + i = reader.queryOverlapping(newLoc.getContig(),(int)newLoc.getStart(),(int)newLoc.getStop()); + IntervalOverlapIterator iter = new IntervalOverlapIterator(StingSAMIteratorAdapter.adapt(null, i),last,false); + while(iter.hasNext()) { + countOfReads++; + iter.next(); + } + last = newLoc; + reader.close(); + + } + Assert.assertEquals(chromosomeOneReadCount,countOfReads); + } +}