an iterator to fix the problem where read-based interval traversals are getting duplicate reads because reads span the two intervals.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1305 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7c20be157c
commit
0b16253db3
|
|
@ -52,7 +52,6 @@ import java.util.List;
|
|||
*/
|
||||
public class SAMDataSource implements SimpleDataSource {
|
||||
|
||||
|
||||
/** Backing support for reads. */
|
||||
private final Reads reads;
|
||||
|
||||
|
|
@ -63,7 +62,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
long readsTaken = 0;
|
||||
|
||||
// our last genome loc position
|
||||
GenomeLoc lastReadPos = null;
|
||||
protected GenomeLoc lastReadPos = null;
|
||||
|
||||
// do we take unmapped reads
|
||||
private boolean includeUnmappedReads = true;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,109 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class DuplicateDetectorIterator
|
||||
*
|
||||
* remove reads that overlap the passed in interval, yea!
|
||||
*
|
||||
*/
|
||||
public class IntervalOverlapIterator implements StingSAMIterator {
|
||||
|
||||
// our wrapped iterator
|
||||
private final StingSAMIterator mIter;
|
||||
private final boolean throwException;
|
||||
|
||||
// storage for the next record
|
||||
private SAMRecord mNextRecord = null;
|
||||
|
||||
// the genomic location which we filter on
|
||||
private final GenomeLoc mLoc;
|
||||
|
||||
/**
|
||||
* Create a DuplicateDetectorIterator from another sam iterator
|
||||
* @param iter something that implements StingSAMIterator
|
||||
* @param blowUpOnDup if we find a dup, do we throw an exception (blow up) or do we drop it
|
||||
*/
|
||||
public IntervalOverlapIterator(StingSAMIterator iter, GenomeLoc filterLocation, boolean blowUpOnDup) {
|
||||
this.mIter = iter;
|
||||
this.throwException = blowUpOnDup;
|
||||
this.mLoc = filterLocation;
|
||||
if (iter.hasNext()) {
|
||||
next();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets source information for the reads. Contains information about the original reads
|
||||
* files, plus information about downsampling, etc.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Reads getSourceInfo() {
|
||||
return mIter.getSourceInfo();
|
||||
}
|
||||
|
||||
/**
|
||||
* close this iterator
|
||||
*/
|
||||
public void close() {
|
||||
if (mIter != null) mIter.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have a next?
|
||||
* @return true if yes, false if not
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return (mNextRecord != null);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the next record
|
||||
* @return a SAMRecord
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
SAMRecord ret = mNextRecord;
|
||||
while (mIter.hasNext()) {
|
||||
mNextRecord = mIter.next();
|
||||
if (!isOverlaping(mNextRecord)) return ret;
|
||||
}
|
||||
mNextRecord = null;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* not supported
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("You can't call remove, so, like, I guess please don't");
|
||||
}
|
||||
|
||||
/**
|
||||
* create an iterator out of the this type
|
||||
* @return this!
|
||||
*/
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* determine if a read overlaps the specified interval that was passed in
|
||||
* @param rec the read
|
||||
* @return true if it overlaps, false otherwise
|
||||
*/
|
||||
private boolean isOverlaping(SAMRecord rec) {
|
||||
return mLoc.overlapsP(GenomeLocParser.createGenomeLoc(rec));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.junit.Assert;
|
||||
import static org.junit.Assert.fail;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class DuplicateDetectorIteratorTest
|
||||
* <p/>
|
||||
* test the DuplicateDetectorIterator class.
|
||||
*/
|
||||
public class IntervalOverlapIteratorTest extends BaseTest {
|
||||
private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam");
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
private int chromosomeOneReadCount = 885;
|
||||
|
||||
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||
/**
|
||||
* This function does the setup of our parser, before each method call.
|
||||
* <p/>
|
||||
* Called before every test case method.
|
||||
*/
|
||||
@BeforeClass
|
||||
public static void beforeAll() {
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
} catch (FileNotFoundException e) {
|
||||
fail("Unexpected Exception" + e.getLocalizedMessage());
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOverlappingIntervals() {
|
||||
int countOfReads = 0;
|
||||
int seqLength = seq.getSequenceDictionary().getSequence("chr1").getSequenceLength();
|
||||
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, 470535);
|
||||
|
||||
// first count the initial pile of reads
|
||||
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||
Iterator<SAMRecord> i = reader.queryOverlapping("chr1",1,470535);
|
||||
GenomeLoc newLoc;
|
||||
while (i.hasNext()) {
|
||||
i.next();
|
||||
countOfReads++;
|
||||
}
|
||||
reader.close();
|
||||
while (last.getStart() < seq.getSequenceDictionary().getSequence("chr1").getSequenceLength()) {
|
||||
reader = new SAMFileReader(bam, true);
|
||||
long stop = (last.getStop() >= seqLength) ? seqLength : last.getStop() + 470535;
|
||||
newLoc = GenomeLocParser.createGenomeLoc(last.getContigIndex(),last.getStart()+470535,stop);
|
||||
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||
i = reader.queryOverlapping(newLoc.getContig(),(int)newLoc.getStart(),(int)newLoc.getStop());
|
||||
IntervalOverlapIterator iter = new IntervalOverlapIterator(StingSAMIteratorAdapter.adapt(null, i),last,false);
|
||||
while(iter.hasNext()) {
|
||||
countOfReads++;
|
||||
iter.next();
|
||||
}
|
||||
last = newLoc;
|
||||
reader.close();
|
||||
|
||||
}
|
||||
Assert.assertEquals(chromosomeOneReadCount,countOfReads);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue