an iterator to fix the problem where read-based interval traversals are getting duplicate reads because reads span the two intervals.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1305 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-07-23 23:59:48 +00:00
parent 7c20be157c
commit 0b16253db3
3 changed files with 190 additions and 2 deletions

View File

@ -52,7 +52,6 @@ import java.util.List;
*/
public class SAMDataSource implements SimpleDataSource {
/** Backing support for reads. */
private final Reads reads;
@ -63,7 +62,7 @@ public class SAMDataSource implements SimpleDataSource {
long readsTaken = 0;
// our last genome loc position
GenomeLoc lastReadPos = null;
protected GenomeLoc lastReadPos = null;
// do we take unmapped reads
private boolean includeUnmappedReads = true;

View File

@ -0,0 +1,109 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Iterator;
/**
*
* @author aaron
*
* Class DuplicateDetectorIterator
*
* remove reads that overlap the passed in interval, yea!
*
*/
public class IntervalOverlapIterator implements StingSAMIterator {
// our wrapped iterator
private final StingSAMIterator mIter;
private final boolean throwException;
// storage for the next record
private SAMRecord mNextRecord = null;
// the genomic location which we filter on
private final GenomeLoc mLoc;
/**
* Create a DuplicateDetectorIterator from another sam iterator
* @param iter something that implements StingSAMIterator
* @param blowUpOnDup if we find a dup, do we throw an exception (blow up) or do we drop it
*/
public IntervalOverlapIterator(StingSAMIterator iter, GenomeLoc filterLocation, boolean blowUpOnDup) {
this.mIter = iter;
this.throwException = blowUpOnDup;
this.mLoc = filterLocation;
if (iter.hasNext()) {
next();
}
}
/**
* Gets source information for the reads. Contains information about the original reads
* files, plus information about downsampling, etc.
*
* @return
*/
public Reads getSourceInfo() {
return mIter.getSourceInfo();
}
/**
* close this iterator
*/
public void close() {
if (mIter != null) mIter.close();
}
/**
* do we have a next?
* @return true if yes, false if not
*/
public boolean hasNext() {
return (mNextRecord != null);
}
/**
* get the next record
* @return a SAMRecord
*/
public SAMRecord next() {
SAMRecord ret = mNextRecord;
while (mIter.hasNext()) {
mNextRecord = mIter.next();
if (!isOverlaping(mNextRecord)) return ret;
}
mNextRecord = null;
return ret;
}
/**
* not supported
*/
public void remove() {
throw new UnsupportedOperationException("You can't call remove, so, like, I guess please don't");
}
/**
* create an iterator out of the this type
* @return this!
*/
public Iterator<SAMRecord> iterator() {
return this;
}
/**
* determine if a read overlaps the specified interval that was passed in
* @param rec the read
* @return true if it overlaps, false otherwise
*/
private boolean isOverlaping(SAMRecord rec) {
return mLoc.overlapsP(GenomeLocParser.createGenomeLoc(rec));
}
}

View File

@ -0,0 +1,80 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.junit.Assert;
import static org.junit.Assert.fail;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
/**
* @author aaron
* <p/>
* Class DuplicateDetectorIteratorTest
* <p/>
* test the DuplicateDetectorIterator class.
*/
public class IntervalOverlapIteratorTest extends BaseTest {
private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam");
private static IndexedFastaSequenceFile seq;
private int chromosomeOneReadCount = 885;
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
/**
* This function does the setup of our parser, before each method call.
* <p/>
* Called before every test case method.
*/
@BeforeClass
public static void beforeAll() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
} catch (FileNotFoundException e) {
fail("Unexpected Exception" + e.getLocalizedMessage());
}
GenomeLocParser.setupRefContigOrdering(seq);
}
@Test
public void testOverlappingIntervals() {
int countOfReads = 0;
int seqLength = seq.getSequenceDictionary().getSequence("chr1").getSequenceLength();
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, 470535);
// first count the initial pile of reads
SAMFileReader reader = new SAMFileReader(bam, true);
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
Iterator<SAMRecord> i = reader.queryOverlapping("chr1",1,470535);
GenomeLoc newLoc;
while (i.hasNext()) {
i.next();
countOfReads++;
}
reader.close();
while (last.getStart() < seq.getSequenceDictionary().getSequence("chr1").getSequenceLength()) {
reader = new SAMFileReader(bam, true);
long stop = (last.getStop() >= seqLength) ? seqLength : last.getStop() + 470535;
newLoc = GenomeLocParser.createGenomeLoc(last.getContigIndex(),last.getStart()+470535,stop);
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
i = reader.queryOverlapping(newLoc.getContig(),(int)newLoc.getStart(),(int)newLoc.getStop());
IntervalOverlapIterator iter = new IntervalOverlapIterator(StingSAMIteratorAdapter.adapt(null, i),last,false);
while(iter.hasNext()) {
countOfReads++;
iter.next();
}
last = newLoc;
reader.close();
}
Assert.assertEquals(chromosomeOneReadCount,countOfReads);
}
}